Esempio n. 1
0
class Wikiparser(HTMLParser):
    def __init__(self, url, verbose=0):
        "Initialise an object, passing 'verbose' to the superclass."
        HTMLParser.__init__(self)
        self.hyperlinks = []
        self.url = url
        self.language = detect_language(url)
        self.pdf = PDFWriter(urllib.unquote(self.url.split("/")[-1]) + ".pdf", StandardPaper.A4)
        header = Header(text_align=pango.ALIGN_CENTER)
        #TODO Alignment not working.
        header.set_text(urllib.unquote(self.url))
        self.pdf.set_header(header) 
        self.pdf.move_context(0, 500)
        h1 = Text(urllib.unquote(self.url.split("/")[-1]), font="serif", font_size=32) 
        h1.color = StandardColors.Blue
        self.pdf.add_text(h1)
        h2 = Text(urllib.unquote(self.url), font="serif", font_size=16) 
        h2.color = StandardColors.Blue
        self.pdf.add_text(h2)
        footer = Footer(text_align=pango.ALIGN_CENTER)
        footer.set_text("wiki2pdf")
        self.pdf.set_footer(footer)
        self.pdf.page_break()
        
    def reset(self):                              
        HTMLParser.reset(self)
        self.images = []
        #TODO Alignment not working.
        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        self.table = False
        self.tr = False
        self.th = False
        self.td = False
        self.caption = False
        self.reference = False
	self.ref_counter = 0
        self.column_counter = 0
        self.current_counter = 0
        self.buffer = None
        self.sup = False
        
    def handle_data(self, data):
        if data.strip() == "": return
	if self.p or self.h1 or self.h2 or self.a or self.span or self.li or self.td or self.th or self.caption:
            if self.buffer != None:
                self.buffer += data
    def handle_starttag(self, tag, attrs):
        if tag == 'img'and not self.table:
            self.start_img(attrs)
        elif tag == 'h1':
            self.start_h1(attrs)
        elif tag == 'h2':
            self.start_h2(attrs)
        elif tag == 'li':
            self.start_li(attrs)
        elif tag == 'p':
            self.start_p(attrs)
        elif tag == 'a':
            self.start_a(attrs)
        elif tag == 'ul':
            self.start_ul(attrs)
        elif tag == 'ol':
            self.start_ol(attrs)
        elif tag == 'table':
            self.start_table(attrs)
        elif tag == 'tr' and self.table:
            self.start_tr(attrs)
        elif tag == 'td' and self.table:
            self.start_td(attrs)
        elif tag == 'th'and self.table:
            self.start_th(attrs)
        elif tag == 'caption' and self.table:
            self.start_caption(attrs)
        elif tag == 'span':
	    self.start_span(attrs)
        elif tag == 'sup' or tag == 'sub' or tag == 'b' or tag == 'i' or tag == 's' or tag == 'small' or tag == 'big' or tag == 'tt' or tag == 'u':
            if self.reference == False and self.table == False:
               if self.buffer != None:
                  self.buffer += "<"+tag+">"
                  self.sup = True


    def handle_endtag(self, tag):
        if tag == 'img' and not self.table:
            self.end_img()
        elif tag == 'h1':
            self.end_h1()
        elif tag == 'h2':
            self.end_h2()
        elif tag == 'li':
            self.end_li()
        elif tag == 'p':
            self.end_p()
        elif tag == 'a':
            self.end_a()
        elif tag == 'ul':
            self.end_ul()
        elif tag == 'ol':
            self.end_ol()
        elif tag == 'table':
            self.end_table()
        elif tag == 'tr' and self.table:
            self.end_tr()
        elif tag == 'td' and self.table:
            self.end_td()
        elif tag == 'th' and self.table:
            self.end_th()
        elif tag == 'caption' and self.table:
            self.end_caption()
        elif tag == 'span':
            self.end_span()
        elif tag == 'sup' or tag == 'sub' or tag == 'b' or tag == 'i' or tag == 's' or tag == 'small' or tag == 'big' or tag == 'tt' or tag == 'u':
            if self.sup and self.buffer != None:
                self.buffer += "</"+str(tag)+">"
        

    def start_img(self, attrs):         
        src = [value for key, value in attrs if key == 'src'] 
        if src:
            self.images.extend(src)
            
    def end_img(self):
        for wiki_image in self.images:
            image  = Image()  
            outpath = self.grab_image(wiki_image, "/tmp")
            image.set_image_file(outpath)
            self.pdf.add_image(image)
        self.images = []
        
    def start_h1(self, attrs):         
        self.h1 = True
        self.buffer = ""
        
    def end_h1(self):
        self.h1 = False
        h1 = Text(self.buffer, font="FreeSerif", font_size=16) 
        h1.color = StandardColors.Blue
        self.pdf.add_text(h1)
        self.buffer = None
        
    def start_h2(self, attrs):         
        self.h2 = True
        self.buffer = ""
        
    def end_h2(self):
        self.h2 = False
        if self.buffer and self.buffer.strip() > "":
            h2 = Text(self.buffer, font="FreeSerif", font_size=14) 
            h2.color = StandardColors.Blue
            self.pdf.add_text(h2)
        self.buffer = None
        
    def start_caption(self, attrs):         
        self.caption = True
        self.buffer = ""
        
    def end_caption(self):
        self.caption = False
        if self.buffer and self.buffer.strip() > "":
            caption = Text(self.buffer, font="FreeSerif", font_size=14) 
            caption.color = StandardColors.Blue
            self.pdf.add_text(caption)
        self.buffer = None

    def start_li(self, attrs):         
        self.li = True
        self.buffer = ""
        
    def end_li(self):
        self.li = False
#        print self.buffer
        if self.buffer and self.buffer.strip() > "":
            if self.ul:
                li = Text(markup = "• " + self.buffer,font="FreeSerif", font_size=10)
            elif self.ol:
                self.ref_counter+=1
                li = Text(markup = str(self.ref_counter) + ". "+ self.buffer.replace("↑",""), font = "FreeSerif", font_size=10)
            else:
                li = Text(markup = self.buffer,font="FreeSerif", font_size=10)     
            self.pdf.add_text(li)
        self.buffer = None
                
    def start_a(self, attrs):         
        self.a = True
        
    def end_a(self):
        self.a = False

    def start_table(self, attrs): 
        for tups in attrs:
	    if 'class' in tups:
		if tups[1] == 'wikitable':
                    self.table = True
                    self.wikitable = Table(border_width = 1)
                    self.wikitable.cell_padding = [2,2,2,2]
        
    def end_table(self):
        if self.table:
            self.table = False
            self.pdf.add_table(self.wikitable)

    def start_tr(self, attrs):         
        self.tr = True
        self.row = Row(height=25)
        self.current_counter = 0
        
    def end_tr(self):
        self.tr = False
        if self.current_counter == self.column_counter:
            self.wikitable.add_row(self.row)

    def start_td(self, attrs):         
        self.td = True
        self.buffer = ""
        
    def end_td(self):
        self.td = False
#        print self.buffer + " " + str(len(self.buffer))
        cell_content = Text(self.buffer,font_size=10)
        cell_content.color = Color(0.0,0.0,0.0,1.0)
        cell = Cell(cell_content, font_size=8,width=100)
        self.row.add_cell(cell)
        self.current_counter+=1
        self.buffer = None

    def start_th(self, attrs):         
        self.th = True
        self.buffer = ""
        
    def end_th(self):
        self.th = False
 #       print self.buffer + " " + str(len(self.buffer))
        cell_content = Text(self.buffer,font_size=10)
        cell_content.color = Color(0.0,0.0,0.0,1.0)
        cell = Cell(cell_content, font_size=8,width=100)
        self.row.add_cell(cell)
        self.column_counter+=1
        self.current_counter+=1
        self.buffer = None
    
#    def start_sup(self, attrs):         
#        self.sup = True
#        self.buffer += "<sup>"
#        
#    def end_sup(self):
#        print "test"
#        self.buffer += "</sup>"

        
    def start_ol(self, attrs):
        self.ol = True
        for tups in attrs:
	    if 'class' in tups:
		if tups[1] == 'references':
                    self.reference = True

    def end_ol(self):
        self.ol = False
        self.ref_counter = 0
        if self.reference:
            self.reference= False
            #self.sup = False
        
    def start_ul(self, attrs):
        self.ul = True    
    def end_ul(self):
        self.ul = False
            
    def start_span(self, attrs):         
        self.span = True
        if self.buffer == None:
            self.buffer = ""  
        
    def end_span(self):
        self.buffer += " "
        self.span = False
            
    def start_p(self, attrs):
        self.p = True
        self.buffer = ""
        
    def end_p(self) :
        self.p = False
        if self.sup:
            para = Paragraph(markup=self.buffer,text = self.buffer, font="FreeSerif", font_size=10,)
            self.sup = False
        else:
            #print self.buffer
            para = Paragraph(text=self.buffer, font="FreeSerif", font_size=10,)
           
        para.set_justify(True)
        if self.language:
            para.language = self.language
        else:
            para.language = None
            
        para.set_hyphenate(True)
        self.pdf.add_paragraph(para) 
#        f= open("computer_para.txt","aw")
#        f.write(self.buffer)
#        f.write("\n")
#        f.close()  
        self.buffer = None
    def set_header(self, text):
        self.header = text

    def grab_image(self, imageurl, outputfolder):
        """
        Get the image from wiki
        """
        output_filename = None
        try:
            link= imageurl.strip()
            parts = link.split("/")
            filename = parts[len(parts)-1]
            output_filename = os.path.join(outputfolder , filename)
            #output_filename=urllib.unquote(output_filename)
            print("GET IMAGE " + link + " ==> " + output_filename)
            if os.path.isfile(output_filename):
                print("File " + output_filename + " already exists")
                return output_filename
            opener = urllib2.build_opener()
            opener.addheaders = [('User-agent', 'Mozilla/5.0')]
            infile = opener.open(link)
            page = infile.read()
            f= open(output_filename,"w")
            f.write(page)
            f.close()
        except KeyboardInterrupt:
            sys.exit()
        except urllib2.HTTPError:
            print("Error: Cound not download the image")
            pass
        return  output_filename
    def parse(self):
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        infile = opener.open(self.url)
        page = infile.read()
        page = cleanup(page)
#        f= open("computer.txt","w")
#        f.write(page)
#        f.close()
#        f = open("computer.txt","r")
#        page=f.read()
#        f.close()
        "Parse the given string 's'."
        self.feed(page)
        self.close()
        self.pdf.flush()
Esempio n. 2
0
class Wikiparser(HTMLParser):
    def __init__(self, url, verbose=0):
        "Initialise an object, passing 'verbose' to the superclass."
        HTMLParser.__init__(self)
        self.hyperlinks = []
        self.url = url
        self.language = detect_language(url)
        self.pdf = PDFWriter(
            urllib.unquote(self.url.split("/")[-1]) + ".pdf", StandardPaper.A4)
        header = Header(text_align=pango.ALIGN_CENTER)
        #TODO Alignment not working.
        header.set_text(urllib.unquote(self.url))
        self.pdf.set_header(header)
        self.pdf.move_context(0, 500)
        h1 = Text(urllib.unquote(self.url.split("/")[-1]),
                  font="serif",
                  font_size=32)
        h1.color = StandardColors.Blue
        self.pdf.add_text(h1)
        h2 = Text(urllib.unquote(self.url), font="serif", font_size=16)
        h2.color = StandardColors.Blue
        self.pdf.add_text(h2)
        footer = Footer(text_align=pango.ALIGN_CENTER)
        footer.set_text("wiki2pdf")
        self.pdf.set_footer(footer)
        self.pdf.page_break()

    def reset(self):
        HTMLParser.reset(self)
        self.images = []
        #TODO Alignment not working.
        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        self.table = False
        self.tr = False
        self.th = False
        self.td = False
        self.caption = False
        self.reference = False
        self.ref_counter = 0
        self.column_counter = 0
        self.current_counter = 0
        self.buffer = None
        self.sup = False

    def handle_data(self, data):
        if data.strip() == "": return
        if self.p or self.h1 or self.h2 or self.a or self.span or self.li or self.td or self.th or self.caption:
            if self.buffer != None:
                self.buffer += data

    def handle_starttag(self, tag, attrs):
        if tag == 'img' and not self.table:
            self.start_img(attrs)
        elif tag == 'h1':
            self.start_h1(attrs)
        elif tag == 'h2':
            self.start_h2(attrs)
        elif tag == 'li':
            self.start_li(attrs)
        elif tag == 'p':
            self.start_p(attrs)
        elif tag == 'a':
            self.start_a(attrs)
        elif tag == 'ul':
            self.start_ul(attrs)
        elif tag == 'ol':
            self.start_ol(attrs)
        elif tag == 'table':
            self.start_table(attrs)
        elif tag == 'tr' and self.table:
            self.start_tr(attrs)
        elif tag == 'td' and self.table:
            self.start_td(attrs)
        elif tag == 'th' and self.table:
            self.start_th(attrs)
        elif tag == 'caption' and self.table:
            self.start_caption(attrs)
        elif tag == 'span':
            self.start_span(attrs)
        elif tag == 'sup' or tag == 'sub' or tag == 'b' or tag == 'i' or tag == 's' or tag == 'small' or tag == 'big' or tag == 'tt' or tag == 'u':
            if self.reference == False and self.table == False:
                if self.buffer != None:
                    self.buffer += "<" + tag + ">"
                    self.sup = True

    def handle_endtag(self, tag):
        if tag == 'img' and not self.table:
            self.end_img()
        elif tag == 'h1':
            self.end_h1()
        elif tag == 'h2':
            self.end_h2()
        elif tag == 'li':
            self.end_li()
        elif tag == 'p':
            self.end_p()
        elif tag == 'a':
            self.end_a()
        elif tag == 'ul':
            self.end_ul()
        elif tag == 'ol':
            self.end_ol()
        elif tag == 'table':
            self.end_table()
        elif tag == 'tr' and self.table:
            self.end_tr()
        elif tag == 'td' and self.table:
            self.end_td()
        elif tag == 'th' and self.table:
            self.end_th()
        elif tag == 'caption' and self.table:
            self.end_caption()
        elif tag == 'span':
            self.end_span()
        elif tag == 'sup' or tag == 'sub' or tag == 'b' or tag == 'i' or tag == 's' or tag == 'small' or tag == 'big' or tag == 'tt' or tag == 'u':
            if self.sup and self.buffer != None:
                self.buffer += "</" + str(tag) + ">"

    def start_img(self, attrs):
        src = [value for key, value in attrs if key == 'src']
        if src:
            self.images.extend(src)

    def end_img(self):
        for wiki_image in self.images:
            image = Image()
            outpath = self.grab_image(wiki_image, "/tmp")
            image.set_image_file(outpath)
            self.pdf.add_image(image)
        self.images = []

    def start_h1(self, attrs):
        self.h1 = True
        self.buffer = ""

    def end_h1(self):
        self.h1 = False
        h1 = Text(self.buffer, font="FreeSerif", font_size=16)
        h1.color = StandardColors.Blue
        self.pdf.add_text(h1)
        self.buffer = None

    def start_h2(self, attrs):
        self.h2 = True
        self.buffer = ""

    def end_h2(self):
        self.h2 = False
        if self.buffer and self.buffer.strip() > "":
            h2 = Text(self.buffer, font="FreeSerif", font_size=14)
            h2.color = StandardColors.Blue
            self.pdf.add_text(h2)
        self.buffer = None

    def start_caption(self, attrs):
        self.caption = True
        self.buffer = ""

    def end_caption(self):
        self.caption = False
        if self.buffer and self.buffer.strip() > "":
            caption = Text(self.buffer, font="FreeSerif", font_size=14)
            caption.color = StandardColors.Blue
            self.pdf.add_text(caption)
        self.buffer = None

    def start_li(self, attrs):
        self.li = True
        self.buffer = ""

    def end_li(self):
        self.li = False
        #        print self.buffer
        if self.buffer and self.buffer.strip() > "":
            if self.ul:
                li = Text(markup="• " + self.buffer,
                          font="FreeSerif",
                          font_size=10)
            elif self.ol:
                self.ref_counter += 1
                li = Text(markup=str(self.ref_counter) + ". " +
                          self.buffer.replace("↑", ""),
                          font="FreeSerif",
                          font_size=10)
            else:
                li = Text(markup=self.buffer, font="FreeSerif", font_size=10)
            self.pdf.add_text(li)
        self.buffer = None

    def start_a(self, attrs):
        self.a = True

    def end_a(self):
        self.a = False

    def start_table(self, attrs):
        for tups in attrs:
            if 'class' in tups:
                if tups[1] == 'wikitable':
                    self.table = True
                    self.wikitable = Table(border_width=1)
                    self.wikitable.cell_padding = [2, 2, 2, 2]

    def end_table(self):
        if self.table:
            self.table = False
            self.pdf.add_table(self.wikitable)

    def start_tr(self, attrs):
        self.tr = True
        self.row = Row(height=25)
        self.current_counter = 0

    def end_tr(self):
        self.tr = False
        if self.current_counter == self.column_counter:
            self.wikitable.add_row(self.row)

    def start_td(self, attrs):
        self.td = True
        self.buffer = ""

    def end_td(self):
        self.td = False
        #        print self.buffer + " " + str(len(self.buffer))
        cell_content = Text(self.buffer, font_size=10)
        cell_content.color = Color(0.0, 0.0, 0.0, 1.0)
        cell = Cell(cell_content, font_size=8, width=100)
        self.row.add_cell(cell)
        self.current_counter += 1
        self.buffer = None

    def start_th(self, attrs):
        self.th = True
        self.buffer = ""

    def end_th(self):
        self.th = False
        #       print self.buffer + " " + str(len(self.buffer))
        cell_content = Text(self.buffer, font_size=10)
        cell_content.color = Color(0.0, 0.0, 0.0, 1.0)
        cell = Cell(cell_content, font_size=8, width=100)
        self.row.add_cell(cell)
        self.column_counter += 1
        self.current_counter += 1
        self.buffer = None


#    def start_sup(self, attrs):
#        self.sup = True
#        self.buffer += "<sup>"
#
#    def end_sup(self):
#        print "test"
#        self.buffer += "</sup>"

    def start_ol(self, attrs):
        self.ol = True
        for tups in attrs:
            if 'class' in tups:
                if tups[1] == 'references':
                    self.reference = True

    def end_ol(self):
        self.ol = False
        self.ref_counter = 0
        if self.reference:
            self.reference = False
            #self.sup = False

    def start_ul(self, attrs):
        self.ul = True

    def end_ul(self):
        self.ul = False

    def start_span(self, attrs):
        self.span = True
        if self.buffer == None:
            self.buffer = ""

    def end_span(self):
        self.buffer += " "
        self.span = False

    def start_p(self, attrs):
        self.p = True
        self.buffer = ""

    def end_p(self):
        self.p = False
        if self.sup:
            para = Paragraph(
                markup=self.buffer,
                text=self.buffer,
                font="FreeSerif",
                font_size=10,
            )
            self.sup = False
        else:
            #print self.buffer
            para = Paragraph(
                text=self.buffer,
                font="FreeSerif",
                font_size=10,
            )

        para.set_justify(True)
        if self.language:
            para.language = self.language
        else:
            para.language = None

        para.set_hyphenate(True)
        self.pdf.add_paragraph(para)
        #        f= open("computer_para.txt","aw")
        #        f.write(self.buffer)
        #        f.write("\n")
        #        f.close()
        self.buffer = None

    def set_header(self, text):
        self.header = text

    def grab_image(self, imageurl, outputfolder):
        """
        Get the image from wiki
        """
        output_filename = None
        try:
            link = imageurl.strip()
            parts = link.split("/")
            filename = parts[len(parts) - 1]
            output_filename = os.path.join(outputfolder, filename)
            #output_filename=urllib.unquote(output_filename)
            print("GET IMAGE " + link + " ==> " + output_filename)
            if os.path.isfile(output_filename):
                print("File " + output_filename + " already exists")
                return output_filename
            opener = urllib2.build_opener()
            opener.addheaders = [('User-agent', 'Mozilla/5.0')]
            infile = opener.open(link)
            page = infile.read()
            f = open(output_filename, "w")
            f.write(page)
            f.close()
        except KeyboardInterrupt:
            sys.exit()
        except urllib2.HTTPError:
            print("Error: Cound not download the image")
            pass
        return output_filename

    def parse(self):
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        infile = opener.open(self.url)
        page = infile.read()
        page = cleanup(page)
        #        f= open("computer.txt","w")
        #        f.write(page)
        #        f.close()
        #        f = open("computer.txt","r")
        #        page=f.read()
        #        f.close()
        "Parse the given string 's'."
        self.feed(page)
        self.close()
        self.pdf.flush()
Esempio n. 3
0
import sys
sys.path.append("../src/")  #not good!
from pypdflib.writer import PDFWriter
from pypdflib.widgets import *
from pypdflib.styles import *
import pango

if __name__=="__main__":
    pdf = PDFWriter("tables.pdf",StandardPaper.A4)
    header = Header(text_align = pango.ALIGN_CENTER)
    #TODO Alignment not working.
    header.set_text("test header")
    pdf.set_header(header)
    footer = Footer(text_align = pango.ALIGN_CENTER)
    footer.set_text("test footer")
    #TODO Alignment not working.
    pdf.set_footer(footer)
    table = Table(border_width=1)
    table.cell_padding = [2, 2, 2, 2]
    row = Row(height=100)
    for i in range(4):
        cell_content = Text("SampleCell "+str(i),font_size=14)
        cell_content.color = Color(0.0,0.0,0.0,1.0)
        cell = Cell(cell_content, font_size=8,width=100)
        row.add_cell(cell)
    for i in range(4):
        table.add_row(row)
    pdf.add_table(table)
    pdf.flush()