Ejemplo n.º 1
0
def dumppdf(outfp,
            fname,
            objids,
            pagenos,
            password='',
            dumpall=False,
            codec=None,
            extractdir=None):
    """Dump selected parts of the PDF file ``fname`` to ``outfp`` as XML.

    Args:
        outfp: Writable file-like object that receives the dump.
        fname: Path of the PDF file to read.
        objids: Iterable of object ids to dump; skipped when empty/None.
        pagenos: Container of zero-based page indexes to dump; skipped when
            empty/None.
        password: Password handed to ``PDFDocument``.
        dumpall: When true, every object is dumped via ``dumpallobjs``.
        codec: Forwarded to the dump helpers. When set, the content streams
            of the selected pages are dumped instead of the page attributes.
            The values ``'raw'`` and ``'binary'`` suppress the trailing
            newline.
        extractdir: Unused by this function; kept so that all dump commands
            share the same signature.

    When none of ``objids``, ``pagenos`` and ``dumpall`` is given, the
    document trailers are dumped instead.
    """
    # The context manager guarantees the input file is closed even when
    # parsing or dumping raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        # Force the flag on so that the dump is never refused.
        doc.is_extractable = True
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
Ejemplo n.º 2
0
def dumpoutline(outfp,
                fname,
                objids,
                pagenos,
                password='',
                dumpall=False,
                codec=None,
                extractdir=None):
    """Write the outline entries of the PDF file ``fname`` to ``outfp`` as XML.

    Each entry returned by ``doc.get_outlines()`` is emitted as an
    ``<outline>`` element carrying its level and title, plus a ``<dest>``
    and a ``<pageno>`` child when a destination could be resolved.
    Documents without outlines produce no output.

    Args:
        outfp: Writable file-like object that receives the XML.
        fname: Path of the PDF file to read.
        password: Password handed to ``PDFDocument``.
        objids, pagenos, dumpall, codec, extractdir: Unused by this
            function; kept so that all dump commands share one signature.
    """
    # open() replaces the file() builtin, which does not exist on Python 3.
    # The with/finally pair releases the parser and the file on every exit
    # path, not only on success.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser, password)
            doc.is_extractable = True
            # Map each page's object id to its zero-based page index.
            pages = dict(
                (page.pageid, pageno)
                for (pageno, page) in enumerate(PDFPage.create_pages(doc)))

            def resolve_dest(dest):
                """Resolve a named destination down to its destination array."""
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        # No direct destination: fall back to a GoTo action.
                        action = a.resolve()
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if (subtype and repr(subtype) == '/GoTo'
                                    and action.get('D')):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    # NOTE(review): on Python 3 .encode() returns bytes, which
                    # '%s' renders with a b'' prefix -- confirm the target
                    # interpreter before changing this.
                    s = e(title).encode('utf-8', 'xmlcharrefreplace')
                    outfp.write(
                        '<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                # A document without outlines is not an error: emit nothing.
                pass
        finally:
            parser.close()
Ejemplo n.º 3
0
def pdf_to_txt(path):
    """Append the text of each horizontal text box in a PDF to a text file.

    The output file name is ``path`` with its last four characters replaced
    by ``.txt``; it is opened in append mode, so repeated runs accumulate
    text.

    Args:
        path: Path of the PDF file to read.
    """
    txt_path = path[:-4] + '.txt'
    # The context manager closes the input file, which was previously leaked.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # is_extractable is a plain attribute of the document (it is read and
        # assigned as such elsewhere); calling it would raise TypeError.
        if not document.is_extractable:
            # NOTE(review): PDFPageAggregator looks like a layout device, not
            # an exception class; PDFTextExtractionNotAllowed was presumably
            # intended -- confirm and import it.
            raise PDFPageAggregator
        rsrcmgr = PDFResourceManager()
        # NOTE(review): get_result() below is presumably only provided by an
        # aggregating device such as PDFPageAggregator(rsrcmgr, laparams=...)
        # -- confirm that a bare PDFDevice supports it.
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process every page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Fetch the layout object produced for this page.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    with open(txt_path, 'a') as f:
                        # NOTE(review): bytes + str only works on Python 2;
                        # confirm the target interpreter.
                        f.write(x.get_text().encode('utf-8') + '\n')
Ejemplo n.º 4
0
def extractembedded(outfp,
                    fname,
                    objids,
                    pagenos,
                    password='',
                    dumpall=False,
                    codec=None,
                    extractdir=None):
    """Extract every embedded file of the PDF ``fname`` into ``extractdir``.

    All objects listed in the document's cross-reference tables are scanned;
    each dictionary whose ``Type`` is ``LITERAL_FILESPEC`` is written to
    ``extractdir`` under the base name recorded in the file specification.

    Args:
        fname: Path of the PDF file to read.
        password: Password handed to ``PDFDocument``.
        extractdir: Directory that receives the extracted files.
        outfp, objids, pagenos, dumpall, codec: Unused by this function;
            kept so that all dump commands share one signature.

    Raises:
        PDFValueError: If a file specification does not reference an
            embedded-file stream.
        IOError: If the target file already exists.
    """
    def extract1(obj):
        """Write the file described by the file specification ``obj``."""
        # 'UF' may be absent; indexing it directly raised KeyError before
        # the 'F' fallback could ever be tried.
        filename = os.path.basename(obj.get('UF') or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile'
                % (filename))
        path = os.path.join(extractdir, filename)
        # Never overwrite an existing file.
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        print('extracting: %r' % path, file=sys.stderr)
        # open() replaces the Python 2-only file() builtin; the context
        # manager closes the output even when the write fails.
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())

    # The input handle was previously never closed.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        doc.is_extractable = True
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if (isinstance(obj, dict)
                        and obj.get('Type') is LITERAL_FILESPEC):
                    extract1(obj)
Ejemplo n.º 5
0
def main():
    """Command-line entry point: mine PDF files for metadata and artefacts.

    Processes the single file or every ``*.pdf`` file in the directory given
    by ``args.path``. Depending on the parsed options, it collects e-mail
    addresses, links, IP addresses, paths, usernames, software names and
    image metadata, then prints the metadata and a summary. A failure while
    handling one file is reported and the remaining files are still
    processed. Exits with status -1 when ``args.path`` is neither a file nor
    a directory.
    """
    global OUTFILE, VERBOSE, ENCODING

    printout(BANNER)

    args = parse_args()

    links = set()
    emails = set()
    usernames = set()
    ips = set()
    paths = set()
    softwares = set()
    locations = set()
    img_users = set()
    img_software = set()
    img_locations = set()
    img_serials = set()
    pdf_metadata = []
    img_metadata = []

    # get all input files
    if os.path.isfile(args.path):
        files = [args.path]
    elif os.path.isdir(args.path):
        files = []
        for name in os.listdir(args.path):
            candidate = os.path.join(args.path, name)
            if os.path.isfile(candidate) and name.endswith('.pdf'):
                files.append(candidate)
        printout('Files to be processed:', False)
        for h in files:
            # Entries already include args.path; joining it a second time
            # printed a doubled prefix for relative directories.
            printout(' %s' % h, False)
    else:
        printout('[!] Error: provided path %s is not a valid file or folder' % args.path)
        sys.exit(-1)

    # The document text is only needed when one of these options is set.
    needs_text = (args.email or args.links or args.ips or args.paths
                  or args.usernames or args.software)

    # extract data from all files
    for f in files:
        # open() is inside the try block so that an unreadable file is
        # reported like any other per-file failure, not aborting the run.
        try:
            with open(f, 'rb') as fp:
                if VERBOSE:
                    printout('* Processing file %s...' % f)
                else:
                    # Blank the status line, then redraw it in place.
                    print(' ' * 200, end='\r')
                    print('* Processing file %s...' % f, end='\r')

                parser = PDFParser(fp)
                doc = PDFDocument(parser)
                if not doc.is_extractable:
                    printout('[!] Document %s is set not to be extractable. Trying anyway...' % f)
                    doc.is_extractable = True
                metadata = get_metadata(doc)
                metadata['_filename'] = f
                pdf_metadata.append(metadata)
                if needs_text:
                    xml = get_xml(f)
                    decoded = html.unescape(xml)
                if args.email:
                    emails |= set(retrieve_all(decoded, rex.RE_EMAIL))
                if args.links:
                    links |= set(retrieve_all(decoded, rex.RE_WWW))
                    links |= set(urls_in_tags(decoded.splitlines()))
                if args.ips:
                    ips |= set(retrieve_all(decoded, rex.RE_IP))
                # NOTE(review): every other check uses args.paths; confirm
                # that parse_args really defines extract_paths, and that
                # `decoded` is always set when it is true.
                if args.extract_paths:
                    paths |= set(paths_in_tooltips(decoded.splitlines()))
                if args.usernames or args.software:
                    u, s = get_users_sw_from_meta(metadata)
                    usernames |= set(u)
                    softwares |= set(s)
                if args.images:
                    image_meta = extract_images(doc, store_path=args.store_images, filename=f)
                    img_metadata.append(image_meta)
                    img_u, img_sw, img_ser, img_loc = get_users_sw_from_img_meta(image_meta)
                    img_users |= set(img_u)
                    img_software |= set(img_sw)
                    img_locations |= set(img_loc)
                    img_serials |= set(img_ser)
        except Exception as ex:
            # Best effort: report the failure and carry on with the next file.
            printout('[!] Error while processing file %s: %s' % (f, ex))
            printout()
            printout(ex, False)

    # now we also retrieve info from the paths structure found
    u_linux, u_mac, u_windows = get_info_from_paths(paths)
    usernames |= set(u_linux)
    usernames |= set(u_mac)
    usernames |= set(u_windows)

    # if images were extracted and metadata to be shown, first show img metadata
    if args.metadata and args.images:
        printout('%s %s %s' % ('.' * 31, 'image metadata', '.' * 31))
        printout()
        print_image_metadata(img_metadata)
    # show pdf metadata
    if args.metadata:
        printout('%s %s %s' % ('.' * 32, 'PDF metadata', '.' * 32))
        printout()
        print_metadata(pdf_metadata)

    # print the summary of results
    if args.summary:
        printout('.' * 78 + '\n')
    if args.usernames:
        print_results('* Usernames found', usernames)
    if args.paths:
        print_results('* Paths found', paths)
    if args.ips:
        print_results('* IPs found', ips)
    if args.email:
        print_results('* Emails found', emails)
    if args.links:
        print_results('* Links found', links)
    if args.software:
        print_results('* Software found', softwares)
    if args.images:
        if img_users and args.usernames:
            print_results('* Users in images', img_users)
        if img_software and args.software:
            print_results('* Software in images', img_software)
        if img_locations:
            print_results('* GPS Locations', img_locations)
        if img_serials:
            print_results('* Serial # in images', img_serials)