def p_dictionary_entry_list(p):
    ''' dictionary_entry_list : dictionary_entry_list NAME object
                              | '''
    # The docstring above is the grammar production (it looks like the PLY
    # yacc convention), so it is not free-form documentation; the empty
    # alternative is kept on its own line.
    #
    # Builds the list of 'entry' nodes of a dictionary, one per NAME/object
    # pair, by appending to the list produced by the recursive rule.
    if len(p) == 1:
        # Empty alternative: nothing matched, start with an empty list.
        p[0] = []
        return

    name_span = p.lexspan(2)
    value_span = p.lexspan(3)
    key_leaf = create_leaf('name', p[2], span=name_span)
    # The entry covers everything from the start of the key to the end of
    # the value.
    entry = create_tree('entry', [key_leaf, p[3]],
                        span=(name_span[0], value_span[1]))
    p[0] = p[1] + [entry]
def p_object_ref(p):
    ''' object : R '''
    # Reduce a lone R token to an 'R' leaf that carries the token value and
    # the token's lexer span. (The docstring is the grammar production.)
    token_value = p[1]
    token_span = p.lexspan(1)
    p[0] = create_leaf('R', token_value, span=token_span)
def p_object_null(p):
    ''' object : NULL '''
    # Reduce a NULL token to a 'null' leaf whose value is None; only the
    # token's lexer span is taken from the input.
    token_span = p.lexspan(1)
    p[0] = create_leaf('null', None, span=token_span)
def p_object_false(p):
    ''' object : FALSE '''
    # Reduce a FALSE token to a 'bool' leaf holding the Python value False.
    token_span = p.lexspan(1)
    p[0] = create_leaf('bool', False, span=token_span)
def p_object_true(p):
    ''' object : TRUE '''
    # Reduce a TRUE token to a 'bool' leaf holding the Python value True.
    token_span = p.lexspan(1)
    p[0] = create_leaf('bool', True, span=token_span)
def p_object_number(p):
    ''' object : NUMBER '''
    # Reduce a NUMBER token to a 'number' leaf. The value is stored as an
    # int when the token has no fractional part, otherwise as a float.
    # (The docstring is the grammar production, so notes live in comments.)
    as_float = float(p[1])
    as_int = int(as_float)
    # A conditional expression is used instead of the old
    # "cond and a or b" idiom: that idiom fell through to the float branch
    # whenever the integer value was 0 (falsy), turning "0" into 0.0.
    value = as_int if as_int == as_float else as_float
    p[0] = create_leaf('number', value, span=p.lexspan(1))
def p_object_hexstring(p):
    ''' object : HEXSTRING '''
    # Reduce a HEXSTRING token to a 'string' leaf; the token value is passed
    # through unchanged together with its lexer span.
    token_value = p[1]
    token_span = p.lexspan(1)
    p[0] = create_leaf('string', token_value, span=token_span)
def p_object_name(p):
    ''' object : NAME '''
    # Reduce a NAME token used as a value to a 'name' leaf carrying the
    # token value and its lexer span.
    token_value = p[1]
    token_span = p.lexspan(1)
    p[0] = create_leaf('name', token_value, span=token_span)
def bruteParser(pdf): ''' This will try to parse any object in the file based on obj/endobj and few other kewords. This is an ad-hoc parsing wich will try to read the file in any posile way. It may produce phantom overlaped XML objects. Yo may check this issues afterwards. Also it is slow. ''' try: #Search for the PDF header headers = list(re.finditer(r'%PDF-1\.[0-7]',pdf)) xml_headers = [] for header in headers: start = header.start() end = header.end() version = header.group(0)[-3:] xml_headers.append(create_leaf('header', version,span=(start,end))) logger.info('Found %d headers'%len(xml_headers)) #Search the startxref. And xrefs. startxrefs = list(re.finditer(r'startxref[\x20\r\n\t\x0c\x00]+[0-9]+[\x20\r\n\t\x0c\x00]+%%EOF',pdf)) xrefs = list(re.finditer(r'xref',pdf)) xml_xrefs = [] xml_pdf_ends = [] for xref in xrefs: start = xref.start() for end in [x.end() for x in startxrefs if x.start()>xref.end()]: logger.info("Searching for a xref, trailer and %%%%EOF at [%s:%s]"%(start,end)) potential_xref = pdf[start:end] try: xml_xref, xml_pdf_end = parse('pdf_brute_end', potential_xref) #fix lexspan and append xml_xref.span_move(start) xml_xrefs.append(xml_xref) #fix lexspan and append xml_pdf_end.span_move(start) xml_pdf_ends.append(xml_pdf_end) except Exception, e: print e logger.info("Couldn't parse a xref, trailer and %%%%EOF at [%s:%s] (%s)"%(start,end,e)) #use the force #This algorithm will try to match any obj with any endobj and will keep it #if a sane object is found inside. Overlapping is possible here, you may analize it #cut it off from the xml later, using the lexspan markers. 
delimiter = r"[()<>\[\]/%\x20\r\n\t\x0c\x00]" objs = list(re.finditer(r'\d+\x20\d+\x20obj'+delimiter, pdf)) endobjs = list(re.finditer(delimiter+r'endobj', pdf)) streams = list(re.finditer(delimiter+'stream'+delimiter, pdf)) endstreams = list(re.finditer('endstream'+delimiter+'endobj', pdf)) xml_iobjects = [] logger.info("Found %d Object starting points"%len(objs)) logger.info("Found %d Object ending points"%len(endobjs)) for m in objs: start = m.start() for end in [x.end() for x in endobjs if x.start()>m.end()]: try: logger.debug("Parsing potential object at %s~%s"%(start,end)) potential_obj = pdf[start:end] ''' DISABLED # If for some reason there are "endstreams" keywords inside the # stream let's momentaneaously escape them, so it can be parsed # with the strict parser escape_endstreams = [e.start()+start for e in endstreams if e.start()>start and e.end()<end ] for e in escape_endstreams[:-1]: potential_obj = potential_obj[:e] +"X"*9 + potential_obj[e+9:] ''' #Try to strictly parse an indirect object xml_iobject = parse('indirect',potential_obj) #fix lexspan xml_iobject.span_move(start) ''' DISABLED #FIX: fix escape #WRONG offset!!!!!!!!!!!! pl = payload(xml_iobject) #Un-escape the "endstream" keywords for e in escape_endstreams[:-1]: pl = pl[:e] +"endstream" + pl[e+9:] setpayload(xml_iobject, pl) ''' #append to the list xml_iobjects.append(xml_iobject) #Just parse the first object we can of this try. #Comment out the following line to search for phantoms #(i.e. objects inside objects or overlaped objects) break except Exception,e: logger.debug("Received exception %s when parsing potential object at [%s:%s]."%(e, start,end))
def p_pdf(p):
    ''' pdf : HEADER pdf_update_list'''
    # Top-level rule: build the root 'pdf' tree whose children are a 'header'
    # leaf followed by every element of the update list (p[2] is
    # concatenated to a list, so it is expected to be a list).
    header_leaf = create_leaf('header', p[1], span=p.lexspan(1))
    children = [header_leaf] + p[2]
    p[0] = create_tree('pdf', children, span=p.lexspan(0), version="OPAF!")
def p_pdf_brute_end(p):
    ''' pdf_brute_end : XREF TRAILER dictionary STARTXREF EOF'''
    # Produces a two-element list: [xref tree, startxref leaf].
    startxref_begin = p.lexspan(4)[0]

    # The xref tree wraps the trailer dictionary; its span runs from offset 0
    # up to the position just before STARTXREF, and the XREF token value is
    # passed as the 'xref' keyword argument.
    xref_node = create_tree('xref', [p[3]],
                            span=(0, startxref_begin - 1),
                            xref=p[1])

    # The startxref leaf spans from the STARTXREF token to the end of the
    # whole production.
    rule_end = p.lexspan(0)[1]
    end_leaf = create_leaf('startxref', p[4], span=(startxref_begin, rule_end))

    p[0] = [xref_node, end_leaf]
def p_pdf_end(p):
    ''' pdf_end : STARTXREF EOF'''
    # Reduce STARTXREF + EOF to a single 'startxref' leaf holding the
    # STARTXREF token value and spanning the whole production.
    rule_span = p.lexspan(0)
    p[0] = create_leaf('startxref', p[1], span=rule_span)
def p_xref_common(p):
    ''' xref : XREF TRAILER dictionary '''
    # Build an 'xref' tree with two children: the trailer dictionary and a
    # 'data' leaf holding the XREF token value converted with str(). Both
    # the leaf and the tree use the span of the whole production.
    rule_span = p.lexspan(0)
    data_leaf = create_leaf('data', str(p[1]), span=rule_span)
    p[0] = create_tree('xref', [p[3], data_leaf], span=rule_span)
def p_indirect_object_stream(p):
    ''' indirect_object_stream : OBJ dictionary STREAM_DATA ENDOBJ '''
    # Build indirect_object -> stream -> [dictionary, data] for an object
    # that carries a stream.
    rule_span = p.lexspan(0)

    # NOTE(review): the data leaf's span starts at the dictionary (symbol 2)
    # and ends at ENDOBJ (symbol 4) rather than using symbol 3's own span --
    # kept as in the original; confirm whether that is intended.
    data_span = (p.lexspan(2)[0], p.lexspan(4)[1])
    data_leaf = create_leaf('data', p[3], span=data_span)

    stream_node = create_tree('stream', [p[2], data_leaf], span=rule_span)

    # p[1] is formatted with "%d %d", so it must be a pair of integers.
    object_id = "%d %d" % p[1]
    p[0] = create_tree('indirect_object', [stream_node],
                       span=rule_span, id=object_id)
xml_iobjects.append(xml_iobject) #Just parse the first object we can of this try. #Comment out the following line to search for phantoms #(i.e. objects inside objects or overlaped objects) break except Exception,e: logger.debug("Received exception %s when parsing potential object at [%s:%s]."%(e, start,end)) logger.info("Succesfully parsed %d/%d Objects ending points"%(len(xml_iobjects),len(endobjs)*len(objs))) #sum all the objects allobjects = xml_headers + xml_xrefs + xml_pdf_ends + xml_iobjects if len(xml_pdf_ends) == 0: logger.info("%%%%EOF tag was not found! Creating a dummy.") dummy_startxref = create_leaf('startxref', -1, span=(len(pdf),len(pdf))) print dummy_startxref.value allobjects.append(dummy_startxref) if len(xml_headers) == 0: logger.info("%%%%PDF-N-M tag was not found! Creating a dummy.") allobjects.append(create_leaf('header', "NOVERSION", span=(0,0))) #Sort it as they appear in the file allobjects = sorted(allobjects,lambda x,y: cmp(x.span[0], y.span[0])) #recreate XML structure 'best' we can... assert allobjects[0].tag == 'header' root_element = create_tree('pdf', [allobjects.pop(0)], span=(0,len(pdf)), version="OPAF!(raw)") update = create_tree('pdf_update', [],span=(0xfffffff,-1))