Esempio n. 1
0
from ReadPdfFromUrl import PdfFromUrl
from pdftotext import PdfToText
# Download two PDF documents, extract their text, and report whether the
# extracted texts match.

# Helper object used to fetch a PDF from a URL and save it under a local name.
pdffromUrl = PdfFromUrl()
# Helper object used to extract the text of a saved PDF file.
p2t = PdfToText()

# Fetch the base template and save it as file1.pdf.
pdffromUrl.open_pdf_from_url(
    "http://www.seleniummaster.com/sitecontent/images/Selenium_Master_Test_Case_Base_Template.pdf",
    "file1.pdf")
# Fetch the modified template and save it as file2.pdf.
pdffromUrl.open_pdf_from_url(
    "http://www.seleniummaster.com/sitecontent/images/Selenium_Master_TestCase_Modified_Template.pdf",
    "file2.pdf")

# Extract the text of each downloaded file.
textOfFile1 = p2t.convert_pdf_to_txt("file1.pdf")
textOfFile2 = p2t.convert_pdf_to_txt("file2.pdf")

# The lengths are reported for information only; they no longer decide the
# outcome of the comparison.
lengthOfTextFile1 = len(textOfFile1)
lengthOfTextFile2 = len(textOfFile2)
# A single pre-formatted argument prints the same line whether `print` is the
# Python 2 statement or the Python 3 function.
print("Length of text of File1 %s" % lengthOfTextFile1)
print("Length of text of File2 %s" % lengthOfTextFile2)

# Compare the texts themselves: two different texts can have the same length,
# so equal lengths alone do not show that the contents match.
if textOfFile1 == textOfFile2:
    print("Two pdf files' texts are the same")
else:
    print("Two pdf files' texts are different")
from ReadPdfFromUrl import PdfFromUrl
from pdftotext import PdfToText
import difflib
import os
#from difflib_data import *
# Download two PDF documents, extract their text, compare the text lengths and,
# when they differ, start preparing an HTML diff report (diff.html).
# Create the helper object used to fetch a PDF from a URL.
pdffromUrl=PdfFromUrl()
# Create the helper object used to extract text from a PDF file.
p2t=PdfToText()
# Fetch the first PDF and save it as file1.pdf.
pdffromUrl.open_pdf_from_url("http://www.seleniummaster.com/sitecontent/images/Selenium_Master_Test_Case_Base_Template.pdf","file1.pdf")
# Fetch the second PDF and save it as file2.pdf.
pdffromUrl.open_pdf_from_url("http://www.seleniummaster.com/sitecontent/images/Selenium_Master_TestCase_Modified_Template.pdf","file2.pdf")
# Extract the text of file1.pdf.
textOfFile1=p2t.convert_pdf_to_txt("file1.pdf")
# Extract the text of file2.pdf.
textOfFile2=p2t.convert_pdf_to_txt("file2.pdf")
# Length of the text extracted from file1.pdf.
lengthOfTextFile1=len(textOfFile1)
# Length of the text extracted from file2.pdf.
lengthOfTextFile2=len(textOfFile2)
# Report both lengths (Python 2 print statements).
print "Length of text of File1",lengthOfTextFile1
print "Length of text of File2",lengthOfTextFile2
# Decide "same" vs. "different" from the lengths only.
# NOTE(review): equal lengths do not guarantee equal texts; comparing
# textOfFile1 with textOfFile2 directly would be the reliable check.
if(lengthOfTextFile1==lengthOfTextFile2):
    print "Two pdf files' texts are the same"
else:
    print "Two pdf files' texts are different"
    # HtmlDiff instance for the report on the two texts.
    d = difflib.HtmlDiff()
    # NOTE(review): the body of the following with-block is cut off in this
    # excerpt; presumably it writes the diff produced by `d` to diff.html -
    # confirm against the complete script.
    with open("diff.html","w") as f:
Esempio n. 3
0
import md5
import pdfx
import time

# Compare two local PDF files named on the command line: print the metadata of
# each file and extract both texts as ASCII-only strings.

# `sys` is needed for the command-line arguments and was never imported by
# this script; the import lives here so that this section runs on its own.
import sys

# PdfToText is instantiated below; it comes from the same module the other
# scripts in this file import it from.
from pdftotext import PdfToText

# Exactly two paths are required. Fail with a usage message instead of an
# opaque tuple-unpacking ValueError when the argument count is wrong.
if len(sys.argv) != 3:
    sys.exit("usage: %s LEFT.pdf RIGHT.pdf" % sys.argv[0])
lfile, rfile = sys.argv[1:]

# Timestamp of this run, formatted as YYYYMMDD-HHMMSS. It is not read within
# the lines shown here and is kept for any later part of the script.
timestr = time.strftime("%Y%m%d-%H%M%S")

# Read and print the metadata of the left-hand file.
lpdf = pdfx.PDFx(lfile)
lmetadata = lpdf.get_metadata()
print(lmetadata)

# Read and print the metadata of the right-hand file.
rpdf = pdfx.PDFx(rfile)
rmetadata = rpdf.get_metadata()
print(rmetadata)

p2t = PdfToText()

# Decode the extracted text as UTF-8, then re-encode it as ASCII with
# 'ignore', which drops every character that has no ASCII representation.
# NOTE(review): assumes convert_pdf_to_txt returns a UTF-8 encoded byte
# string (Python 2 `str`) - confirm against the PdfToText implementation.
textOfFile1 = p2t.convert_pdf_to_txt(lfile).decode('utf-8').encode('ascii', 'ignore')
textOfFile2 = p2t.convert_pdf_to_txt(rfile).decode('utf-8').encode('ascii', 'ignore')
#print textOfFile1
#print textOfFile2
##open first pdf file and save it as file1.pdf
#pdffromUrl.open_pdf_from_url("http://www.seleniummaster.com/sitecontent/images/Selenium_Master_Test_Case_Base_Template.pdf","file1.pdf")
##open second pdf file and save it as file2.pdf
#pdffromUrl.open_pdf_from_url("http://www.seleniummaster.com/sitecontent/images/Selenium_Master_TestCase_Modified_Template.pdf","file2.pdf")
##get texts of file1.pdf
##get length of file1.pdf texts
#textOfFile1 = textOfFile1.decode('UTF-8','ignore')
#textOfFile1 = textOfFile1.encode('ascii','ignore')

#textOfFile2 = textOfFile2.decode('UTF-8','ignore')