Ejemplo n.º 1
0
def p_dictionary_entry_list(p):
    ''' dictionary_entry_list : dictionary_entry_list NAME object
                              |  '''
    # NOTE: the docstring above is the grammar rule and must stay untouched.
    # Empty alternative: only the result slot exists, so start a fresh list.
    if len(p) == 1:
        p[0] = []
        return

    # Recursive alternative: wrap "NAME object" in an 'entry' node whose span
    # runs from the start of the key to the end of the value, then append it
    # to a copy of the entries collected so far.
    name_span = p.lexspan(2)
    name_leaf = create_leaf('name', p[2], span=name_span)
    entry_span = (name_span[0], p.lexspan(3)[1])
    entry_node = create_tree('entry', [name_leaf, p[3]], span=entry_span)
    p[0] = p[1] + [entry_node]
Ejemplo n.º 2
0
def p_object_ref(p):
    ''' object : R '''
    # Wrap the R token value in an 'R' leaf that carries the token's span.
    token_value = p[1]
    token_span = p.lexspan(1)
    p[0] = create_leaf('R', token_value, span=token_span)
Ejemplo n.º 3
0
def p_object_null(p):
    ''' object : NULL '''
    # The NULL token becomes a 'null' leaf whose value is None.
    token_span = p.lexspan(1)
    p[0] = create_leaf('null', None, span=token_span)
Ejemplo n.º 4
0
def p_object_false(p):
    ''' object : FALSE '''
    # The FALSE token becomes a 'bool' leaf holding the Python value False.
    token_span = p.lexspan(1)
    p[0] = create_leaf('bool', False, span=token_span)
Ejemplo n.º 5
0
def p_object_true(p):
    ''' object : TRUE '''
    # The TRUE token becomes a 'bool' leaf holding the Python value True.
    token_span = p.lexspan(1)
    p[0] = create_leaf('bool', True, span=token_span)
Ejemplo n.º 6
0
def p_object_number(p):
    ''' object : NUMBER '''
    # Build a 'number' leaf from the NUMBER token: whole values are stored as
    # int, everything else as float.
    #
    # The previous "cond and int(...) or float(...)" form fell through to the
    # float branch whenever the integer result was 0 (falsy), so "0" became
    # 0.0 while "1" became 1.  An explicit check treats every whole value the
    # same way; is_integer() is also False for inf/nan, so those stay floats
    # instead of raising inside int().
    value = float(p[1])
    if value.is_integer():
        value = int(value)
    p[0] = create_leaf('number', value, span=p.lexspan(1))
Ejemplo n.º 7
0
def p_object_hexstring(p):
    ''' object : HEXSTRING '''
    # The HEXSTRING token value is stored in a 'string' leaf as-is.
    token_value = p[1]
    token_span = p.lexspan(1)
    p[0] = create_leaf('string', token_value, span=token_span)
Ejemplo n.º 8
0
def p_object_name(p):
    ''' object : NAME '''
    # A NAME token used as an object becomes a 'name' leaf with its span.
    token_value = p[1]
    token_span = p.lexspan(1)
    p[0] = create_leaf('name', token_value, span=token_span)
Ejemplo n.º 9
0
def bruteParser(pdf):
    '''
        Try to parse every object in the file, based on obj/endobj and a few other keywords.
        This is an ad-hoc parser which tries to read the file in any possible way.
        It may produce phantom, overlapping XML objects; check for these issues afterwards.
        It is also slow.

        pdf is the raw file content; it is searched with regular expressions
        and sliced by offset.
    '''
    try:
        #Search for the PDF header
        headers = list(re.finditer(r'%PDF-1\.[0-7]',pdf))
        xml_headers = []
        for header in headers:
            start = header.start()
            end = header.end()
            #The last three characters of the match are the version, e.g. "1.4"
            version = header.group(0)[-3:]
            xml_headers.append(create_leaf('header', version,span=(start,end)))
        logger.info('Found %d headers'%len(xml_headers))
        
        #Search for the startxref...%%EOF endings and for the xref keywords.
        #NOTE: the plain 'xref' pattern also matches the tail of every 'startxref'.
        startxrefs = list(re.finditer(r'startxref[\x20\r\n\t\x0c\x00]+[0-9]+[\x20\r\n\t\x0c\x00]+%%EOF',pdf))
        xrefs = list(re.finditer(r'xref',pdf))    
        xml_xrefs = []
        xml_pdf_ends = []
        for xref in xrefs:
            start = xref.start()
            #Pair this xref with the end of every startxref...%%EOF that begins after it
            for end in [x.end() for x in startxrefs if x.start()>xref.end()]:
                logger.info("Searching for a xref, trailer and %%%%EOF at [%s:%s]"%(start,end))
                potential_xref = pdf[start:end]
                try:
                    xml_xref, xml_pdf_end = parse('pdf_brute_end', potential_xref)
                    #The slice was parsed on its own; span_move(start) presumably
                    #shifts its spans back to file offsets -- TODO confirm
                    #fix lexspan and append
                    xml_xref.span_move(start)
                    xml_xrefs.append(xml_xref)
                    #fix lexspan and append
                    xml_pdf_end.span_move(start)
                    xml_pdf_ends.append(xml_pdf_end)
                except Exception, e:
                    #Best effort: report the failure and move on to the next candidate
                    print e
                    logger.info("Couldn't parse a xref, trailer and %%%%EOF at [%s:%s] (%s)"%(start,end,e))

        #use the force
        #This algorithm tries to match every obj with every endobj and keeps the
        #pair if a sane object is found inside. Overlapping is possible here; you may
        #analyze it and cut it off from the xml later, using the lexspan markers.
        delimiter = r"[()<>\[\]/%\x20\r\n\t\x0c\x00]"
        objs = list(re.finditer(r'\d+\x20\d+\x20obj'+delimiter, pdf))
        endobjs = list(re.finditer(delimiter+r'endobj', pdf))
        #streams and endstreams are not used by any active code in this part of
        #the function; endstreams is only mentioned in the disabled snippet below.
        streams = list(re.finditer(delimiter+'stream'+delimiter, pdf))
        endstreams = list(re.finditer('endstream'+delimiter+'endobj', pdf))
        xml_iobjects = []
        logger.info("Found %d Object starting points"%len(objs))
        logger.info("Found %d Object ending points"%len(endobjs))
        for m in objs:
            start = m.start()
            #Candidate ends are tried in file order: every endobj that begins after this obj header
            for end in [x.end() for x in endobjs if x.start()>m.end()]:
                try:
                    logger.debug("Parsing potential object at %s~%s"%(start,end))
                    potential_obj = pdf[start:end]
                    
                    '''
                    DISABLED
                    # If for some reason there are "endstreams" keywords inside the 
                    # stream let's momentaneaously escape them, so it can be parsed  
                    # with the strict parser
                    escape_endstreams = [e.start()+start for e in endstreams if e.start()>start and e.end()<end ]
                    for e in escape_endstreams[:-1]:
                        potential_obj = potential_obj[:e] +"X"*9 + potential_obj[e+9:]
                    '''
                    
                    #Try to strictly parse an indirect object
                    xml_iobject = parse('indirect',potential_obj)

                    #fix lexspan
                    xml_iobject.span_move(start)

                    '''
                    DISABLED
                    #FIX: fix escape
                    #WRONG offset!!!!!!!!!!!!
                    pl = payload(xml_iobject)
                    #Un-escape the "endstream" keywords
                    for e in escape_endstreams[:-1]:
                        pl = pl[:e] +"endstream" + pl[e+9:]
                    setpayload(xml_iobject, pl)
                    '''

                    #append to the list
                    xml_iobjects.append(xml_iobject)

                    #Keep only the first object that parses for this starting point.
                    #Comment out the following break to search for phantoms
                    #(i.e. objects inside objects or overlapping objects)
                    break
                except Exception,e:
                    #A failed candidate is only logged; the next endobj is tried
                    logger.debug("Received exception %s when parsing potential object at [%s:%s]."%(e, start,end))
Ejemplo n.º 10
0
def p_pdf(p):
    ''' pdf : HEADER pdf_update_list'''
    # Root rule: a 'pdf' tree whose first child is the header leaf, followed
    # by every item of the update list.
    header_leaf = create_leaf('header', p[1], span=p.lexspan(1))
    children = [header_leaf] + p[2]
    p[0] = create_tree('pdf', children, span=p.lexspan(0), version="OPAF!")
Ejemplo n.º 11
0
def p_pdf_brute_end(p):
    ''' pdf_brute_end : XREF TRAILER  dictionary STARTXREF EOF'''
    # Produces a two-item list: the 'xref' tree (wrapping the trailer
    # dictionary, with the XREF token value kept as the xref attribute) and
    # the 'startxref' leaf.
    startxref_begin = p.lexspan(4)[0]

    # The xref span runs from offset 0 up to just before STARTXREF.
    xref_node = create_tree('xref', [p[3]], span=(0, startxref_begin - 1), xref=p[1])

    # The startxref span runs from STARTXREF to the end of the production.
    end_node = create_leaf('startxref', p[4], span=(startxref_begin, p.lexspan(0)[1]))

    p[0] = [xref_node, end_node]
Ejemplo n.º 12
0
def p_pdf_end(p):
    ''' pdf_end : STARTXREF EOF'''
    # The STARTXREF value becomes a 'startxref' leaf spanning the whole rule.
    rule_span = p.lexspan(0)
    p[0] = create_leaf('startxref', p[1], span=rule_span)
Ejemplo n.º 13
0
def p_xref_common(p):
    ''' xref : XREF TRAILER dictionary '''
    # An 'xref' tree holding the trailer dictionary followed by a 'data' leaf
    # with the stringified XREF token value; both nodes take the rule's span.
    rule_span = p.lexspan(0)
    raw_table = create_leaf('data', str(p[1]), span=rule_span)
    p[0] = create_tree('xref', [p[3], raw_table], span=rule_span)
Ejemplo n.º 14
0
def p_indirect_object_stream(p):
    ''' indirect_object_stream : OBJ dictionary STREAM_DATA ENDOBJ '''
    # Builds indirect_object -> stream -> [dictionary, data].
    rule_span = p.lexspan(0)

    # The data leaf's span starts where the dictionary starts and ends where
    # ENDOBJ ends.
    data_span = (p.lexspan(2)[0], p.lexspan(4)[1])
    data_leaf = create_leaf('data', p[3], span=data_span)

    stream_node = create_tree('stream', [p[2], data_leaf], span=rule_span)

    # The OBJ token value is formatted as two integers to form the id.
    object_id = "%d %d"%p[1]
    p[0] = create_tree('indirect_object', [stream_node], span=rule_span, id=object_id)
Ejemplo n.º 15
0
                    xml_iobjects.append(xml_iobject)

                    #Keep only the first object that parses for this starting point.
                    #Comment out the following break to search for phantoms
                    #(i.e. objects inside objects or overlapping objects)
                    break
                except Exception,e:
                    logger.debug("Received exception %s when parsing potential object at [%s:%s]."%(e, start,end))
        logger.info("Succesfully parsed %d/%d Objects ending points"%(len(xml_iobjects),len(endobjs)*len(objs)))

        #sum all the objects
        allobjects = xml_headers + xml_xrefs + xml_pdf_ends + xml_iobjects

        if len(xml_pdf_ends) == 0:
            logger.info("%%%%EOF tag was not found! Creating a dummy.")
            dummy_startxref = create_leaf('startxref', -1, span=(len(pdf),len(pdf)))
            print dummy_startxref.value
            allobjects.append(dummy_startxref)

        if len(xml_headers) == 0:
            logger.info("%%%%PDF-N-M tag was not found! Creating a dummy.")
            allobjects.append(create_leaf('header', "NOVERSION", span=(0,0)))

        #Sort it as they appear in the file
        allobjects = sorted(allobjects,lambda x,y: cmp(x.span[0], y.span[0]))

        #recreate XML structure 'best' we can...
        assert allobjects[0].tag == 'header'
        root_element = create_tree('pdf', [allobjects.pop(0)], span=(0,len(pdf)), version="OPAF!(raw)")
        
        update = create_tree('pdf_update', [],span=(0xfffffff,-1))