Example #1
0
def pdf_to_text(path_in, path_out):
	"""Convert the PDF at ``path_in`` into a text file at ``path_out``.

	The conversion only runs while the ``use_pdf2txt`` config property is
	'0'. If the pdfminer modules cannot be imported, a notice is printed and
	``use_pdf2txt`` is set to "1"; nothing is converted in that case.

	Args:
		path_in: path of the PDF file to read.
		path_out: path of the text file to write (opened with mode 'w').

	Returns:
		None.
	"""
	if conf.get_prop("use_pdf2txt") == '0':
		try:
			# only the names the conversion below actually uses
			from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
			from pdfminer.pdfpage import PDFPage
			from pdfminer.converter import TextConverter
			from pdfminer.layout import LAParams
		except ImportError:
			# a bare except would also swallow KeyboardInterrupt/SystemExit
			print ("system doesn't have PDFminer.six library installed. Try to use pdf2txt.")
			conf.set_prop("use_pdf2txt", "1")
		else:
			rsrcmgr = PDFResourceManager(caching=True)
			# the input is opened first so a missing PDF does not leave a
			# freshly truncated output file behind; `with` closes both
			# handles even when a page fails to process
			with open(path_in, 'rb') as fp, open(path_out, 'w') as outfp:
				device = TextConverter(rsrcmgr, outfp, codec='utf-8',
					laparams=LAParams(), imagewriter=None)
				try:
					interpreter = PDFPageInterpreter(rsrcmgr, device)
					for page in PDFPage.get_pages(fp, set(), caching=True,
							check_extractable=True):
						interpreter.process_page(page)
				finally:
					# release the converter before its output file is closed
					device.close()
Example #2
0
def get_image_ids(swf_path):
	"""Return the ids listed for JPEG images in swfextract's output.

	Runs ``swfextract`` (its location is prefixed with the ``swftools_path``
	config property) on ``swf_path`` and parses the id list that follows the
	first occurrence of "JPEG" in the output.

	Args:
		swf_path: path of the SWF file to inspect.

	Returns:
		A list of ints, or an empty list when the output never mentions
		"JPEG".

	Raises:
		subprocess.CalledProcessError: if swfextract exits with a non-zero
			status.
		ValueError: if the extracted id text is not a comma-separated list
			of integers.
	"""
	# On Python 3 str() of the captured bytes gives their repr, so a line
	# break appears as the two characters backslash + 'n'; the search for
	# '\\n' below relies on that.
	swfextractOut = str(subprocess.check_output(
		[conf.get_prop("swftools_path") + "swfextract", swf_path]))

	# finds the string with the JPEG ids
	beginpoint = swfextractOut.find("JPEG")

	# Compare by value: `is not -1` tests object identity, which only
	# happens to work because CPython caches small ints.
	if beginpoint == -1:
		return []

	# the ids start two characters after the next ")" and run to the end
	# of that line
	startpoint = swfextractOut.find(")", beginpoint) + 2
	endpoint = swfextractOut.find('\\n', beginpoint)
	idsStr = swfextractOut[startpoint:endpoint]

	# prints debug info
	print("starting point: " + str(startpoint))
	print("ending point: " + str(endpoint))
	print(idsStr)

	# split(",") also covers the single-id case; int() ignores the
	# whitespace around each piece
	return [int(idStr) for idStr in idsStr.split(",")]
Example #3
0
def extract_images(swf_path, out_dir):
	"""Extract each JPEG image of an SWF file into ``out_dir``.

	For every id returned by ``get_image_ids(swf_path)`` this runs
	``swfextract -j <id> -o <out_dir>/<id>.jpg <swf_path>``. The exit status
	of swfextract is not checked.

	Args:
		swf_path: path of the SWF file to read.
		out_dir: directory the ``<id>.jpg`` files are written to.

	Returns:
		None.
	"""
	# imported locally so the block does not depend on a module-level import
	import subprocess

	swfextract = conf.get_prop("swftools_path") + "swfextract"

	# An argument list (no shell) keeps paths containing spaces or shell
	# metacharacters intact instead of letting a shell re-split them.
	# get_image_ids has already launched this same executable, so a missing
	# tool surfaces there first.
	for imageId in get_image_ids(swf_path):
		subprocess.call([swfextract, "-j", str(imageId),
			"-o", out_dir + "/" + str(imageId) + ".jpg", swf_path])
Example #4
0
def extract_audio(swf_path, out_path):
	"""Run ``swfextract -m`` on an SWF file, writing the result to ``out_path``.

	The swfextract location is prefixed with the ``swftools_path`` config
	property. The exit status of swfextract is not checked.

	Args:
		swf_path: path of the SWF file to read.
		out_path: path handed to swfextract's ``-o`` option.

	Returns:
		None.
	"""
	# imported locally so the block does not depend on a module-level import
	import subprocess

	# An argument list (no shell) keeps paths containing spaces or shell
	# metacharacters intact instead of letting a shell re-split them.
	command = [conf.get_prop("swftools_path") + "swfextract",
		"-m", "-o", out_path, swf_path]
	try:
		subprocess.call(command)
	except OSError as err:
		# os.system never raised when the tool could not be started;
		# keep that best-effort contract and just report the failure
		print("could not run swfextract: " + str(err))
    if (len(sys.argv) == 1):
        print("0 = gets_swf.fetch_all")
        print("1 = gets_swf.extract_audio")
        print("2 = gets_swf.extract_audio_all")
        print("3 = gets_swf.convert_pdf")
        print("4 = conf.set_prop")
        print("5 = conf.get_prop")
        print("6 = conf.check_prop_exists")
        print("7 = gets_swf.gets_image_IDs")
        print("8 = extract_images")
        print("9 = extract_images_all")
        print("10 = gets_swf.get_linked_text")
        print("11 = gets_swf.get_linked_text_all")

    else:
        if (sys.argv[1] == "0"): gets_swf.fetch_all(sys.argv[2], sys.argv[3])
        if (sys.argv[1] == "1"):
            gets_swf.extract_audio(sys.argv[2], sys.argv[3])
        if (sys.argv[1] == "2"): gets_swf.extract_audio_all(sys.argv[2])
        if (sys.argv[1] == "3"): gets_swf.convert_pdf(sys.argv[2], sys.argv[3])
        if (sys.argv[1] == "4"): conf.set_prop(sys.argv[2], sys.argv[3])
        if (sys.argv[1] == "5"): print(conf.get_prop(sys.argv[2]))
        if (sys.argv[1] == "6"): print(conf.check_prop_exist(sys.argv[2]))
        if (sys.argv[1] == "7"): print(gets_swf.get_image_ids(sys.argv[2]))
        if (sys.argv[1] == "8"):
            gets_swf.extract_images(sys.argv[2], sys.argv[3])
        if (sys.argv[1] == "9"): gets_swf.extract_images_all(sys.argv[2])
        if (sys.argv[1] == "10"):
            print(gets_swf.get_linked_text(sys.argv[2], sys.argv[3]))
        if (sys.argv[1] == "11"): gets_swf.get_linked_text_all(sys.argv[2])
    if (not os.path.exists(arguments[1])):
        #print ("Could not find path " + arguments[1])
        #sys.exit(0)
        os.system("mkdir " + arguments[1])

    # the code expects paths to end in '/'s so append them if needed
    if (not arguments[0].endswith("/")): arguments[0] = arguments[0] + "/"
    if (not arguments[1].endswith("/")): arguments[1] = arguments[1] + "/"

    # set output_dir
    conf.set_prop("output_dir", arguments[1])

    kewUrl = arguments[0]

    print("STEP 1: Acquiring KEW slides from server")
    gets_swf.fetch_all(kewUrl, conf.get_prop("output_dir"))
    print("STEP 2: Fetching linked text")
    gets_swf.get_linked_text_all(conf.get_prop("output_dir"))
    print("STEP 3: Extracting audio")
    gets_swf.extract_audio_all(conf.get_prop("output_dir"))
    print("STEP 4: Extracting images")
    gets_swf.extract_images_all(conf.get_prop("output_dir"))
    gets_swf.clean_up_all(conf.get_prop("output_dir"))

    if (len(arguments) > 2):
        if (arguments[2] == "-z"):
            # remove '/' at end of string
            zip_dir = conf.get_prop("output_dir")
            if (zip_dir.endswith("/")): zip_dir = zip_dir[:-1]

            # run zip on output_dir and delete the original directory