def scrub(cls, content, verbose=0):
    """Remove objects carrying the "Authorized licensed use" watermark.

    The content is parsed with ``parse_content`` and every object id listed
    in the first xref table is inspected. Objects whose ``Filter`` attribute
    is ``/FlateDecode`` and whose decoded data contains the watermark phrase
    are collected, then stripped one by one with ``remove_object_by_id``.

    Args:
        content: PDF content, in whatever form ``parse_content`` and
            ``remove_object_by_id`` accept.
        verbose: 0 is silent; >= 1 reports each flagged object on stderr;
            >= 2 additionally includes up to 1000 characters of the data
            starting at the watermark phrase.

    Returns:
        The content after every flagged object has been removed.
    """
    phrase = "Authorized licensed use limited to: "
    evil_ids = []

    # parse the pdf into a pdfminer document
    pdf = parse_content(content)

    # get a list of all object ids
    xref = pdf.xrefs[0]

    # check each object in the pdf
    for objid in xref.get_objids():
        obj = pdf.getobj(objid)
        if not hasattr(obj, "attrs"):
            continue

        # watermarks tend to be in FlateDecode elements
        if "Filter" not in obj.attrs or str(obj.attrs["Filter"]) != "/FlateDecode":
            continue

        data = copy(obj.get_data())

        # Search and slice the same stringified text. The match used to be
        # tested on str(data) while the excerpt was cut from the raw data,
        # which fails when the raw data is not a str (e.g. bytes).
        text = str(data)
        start = text.find(phrase)
        if start < 0:
            continue

        if verbose >= 2:
            sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, objid, phrase, text[start:start + 1000]))
        elif verbose >= 1:
            sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, objid, phrase,))

        evil_ids.append(objid)

    for objid in evil_ids:
        content = remove_object_by_id(content, objid)

    return content
def scrub(cls, content, verbose=0):
    """Remove the single 432x230 object treated as an ad.

    Every object id in the first xref table of the parsed content is
    examined; an object is flagged when its ``Width`` attribute stringifies
    to "432" and its ``Height`` attribute stringifies to "230". Flagged
    objects are removed with ``remove_object_by_id``.

    Args:
        content: PDF content, in whatever form ``parse_content`` and
            ``remove_object_by_id`` accept.
        verbose: accepted for interface compatibility; not used here.

    Returns:
        The content with the flagged object (if any) removed.

    Raises:
        Exception: if more than one object matches the ad dimensions.
    """
    document = parse_content(content)
    first_xref = document.xrefs[0]

    flagged = []
    for object_id in first_xref.get_objids():
        candidate = document.getobj(object_id)
        if not hasattr(candidate, "attrs"):
            continue

        attrs = candidate.attrs
        has_ad_width = ("Width" in attrs) and str(attrs["Width"]) == "432"
        if not has_ad_width:
            continue
        has_ad_height = ("Height" in attrs) and str(attrs["Height"]) == "230"
        if has_ad_height:
            flagged.append(object_id)

    # More than one hit means the size heuristic is probably wrong for this
    # document, so refuse to strip anything.
    if len(flagged) > 1:
        raise Exception("too many ads detected on the page, please double check?")

    cleaned = content
    for object_id in flagged:
        cleaned = remove_object_by_id(cleaned, object_id)
    return cleaned
def test_remove_object_by_id(self):
    """Check ``remove_object_by_id`` on empty and single-object content."""
    # Empty content comes back unchanged whichever id is requested.
    for requested_id in (1, 2, 100):
        empty = ""
        result = remove_object_by_id(empty, requested_id)
        self.assertEqual(empty, result)

    single_object = "1 0 obj\nthings\nendobj\nleftovers"

    # Asking for an id that is not present leaves the content untouched.
    untouched = remove_object_by_id(single_object, 2)
    self.assertEqual(single_object, untouched)

    # Removing object 1 leaves only the text that followed it.
    stripped = remove_object_by_id(single_object, 1)
    self.assertEqual("leftovers", stripped)
def scrub(cls, content, verbose=0):
    """Remove short objects carrying the AIP redistribution watermark.

    Every object id in the first xref table of the parsed content is
    examined. An object is flagged when its ``Filter`` attribute is
    ``/FlateDecode``, its ``Length`` attribute is below 1000, and its decoded
    data contains the watermark phrase. Flagged objects are removed with
    ``remove_object_by_id``.

    Args:
        content: PDF content, in whatever form ``parse_content`` and
            ``remove_object_by_id`` accept.
        verbose: 0 is silent; >= 1 reports each flagged object on stderr;
            >= 2 additionally includes the object's decoded data.

    Returns:
        The content after every flagged object has been removed.
    """
    watermark = "Redistribution subject to AIP license or copyright"

    document = parse_content(content)
    first_xref = document.xrefs[0]

    flagged = []
    for object_id in first_xref.get_objids():
        candidate = document.getobj(object_id)
        if not hasattr(candidate, "attrs"):
            continue

        # Watermarks tend to live in FlateDecode streams.
        is_flate = "Filter" in candidate.attrs and str(candidate.attrs["Filter"]) == "/FlateDecode"
        if not is_flate:
            continue

        # The watermark is never very long, so skip anything sizeable.
        # NOTE(review): assumes "Length" is a plain number here rather than
        # an indirect reference - confirm against the parser's output.
        stream_length = candidate.attrs["Length"]
        if not (stream_length < 1000):
            continue

        decoded = copy(candidate.get_data())
        if watermark not in str(decoded):
            continue

        if verbose >= 2:
            sys.stderr.write("%s: Found object %s with %r: %r; omitting..." % (cls.__name__, object_id, watermark, decoded))
        elif verbose >= 1:
            sys.stderr.write("%s: Found object %s with %r; omitting..." % (cls.__name__, object_id, watermark,))

        flagged.append(object_id)

    cleaned = content
    for object_id in flagged:
        cleaned = remove_object_by_id(cleaned, object_id)
    return cleaned