Example 1
def evaluate_response(ret_dict):
    ''' this method usually comes after download_file; it evaluates what
        happened and whether we even have data to process
        params: ret_dict - the dict returned by download_file; its "data"
                           field holds the requests response object
        returns: dict { "code":<code>, "data":<response>, "error":<error> }
                 - the status flag, the response object, the error flag
        '''
    # extract data from ret_dict
    req = ret_dict['data']

    # get status code
    url = req.url
    status = req.status_code
    reason = req.reason

    # ahh everything is fine
    if status == 200:
        logger.info('download file, {0} {1} {2}'.format(url, reason, status))
        return {"code": True, "data": req, "error": False}

    # nah something is not like it should be
    else:
        logger.warning('download file, {0} {1} {2}'.format(
            url, reason, status))
        return {"code": False, "data": req, "error": True}
Example 2
def check_encryption(filename):
    ''' basic function to check if a file is encrypted
    '''

    print(filename)
    try:
        # note: the file object goes straight into the reader and is never
        # explicitly closed (see the commented-out fr.close() below)
        fr = pdf.PdfFileReader(open(filename, "rb"))
        print(fr)
    except pdf.utils.PdfReadError as e:
        logger.warning('check encryption {0}'.format(e))
        return -1

    if fr.getIsEncrypted():
        print('[i] File encrypted %s' % filename)
        nfr = decrypt_empty_pdf(filename)
        if nfr != -1:
            get_DocInfo(filename, nfr)
            get_xmp_meta_data(filename, nfr)

    else:
        get_DocInfo(filename, fr)
        get_xmp_meta_data(filename, fr)

    # fr.close()

    return True
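The commented-out fr.close() hints that the file handle stays open. A minimal alternative sketch, assuming PyPDF2 is imported as pdf and that only the encryption flag is needed (the reader parses the file at construction time, so the handle can be closed right after):

def check_encryption_flag(filename):
    # hypothetical helper: only answers "is it encrypted?"; the context
    # manager closes the file handle deterministically
    with open(filename, "rb") as fobj:
        return pdf.PdfFileReader(fobj).getIsEncrypted()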
Example 3
def _parse_pdf(filename):
    ''' the real parsing function '''

    logger.info('Parsing {0}'.format(filename))
    if check_file_size(filename):
        ret = check_encryption(filename)
        return ret
    else:
        logger.warning('Filesize is 0 bytes at file: {0}'.format(filename))
        return False
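check_file_size is referenced here but is not among these examples; a plausible minimal sketch matching the log message above (hypothetical reconstruction using os.path.getsize):

import os

def check_file_size(filename):
    # hypothetical: True for non-empty files, False for 0-byte downloads
    try:
        return os.path.getsize(filename) > 0
    except OSError:
        return False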
Example 4
def decrypt_empty_pdf(filename):
    ''' this function simply tries to decrypt the pdf with the empty (null)
        password; this works as long as no real password has been set.
        if a proper password has been set -> john (the ripper)
    '''

    fr = pdf.PdfFileReader(open(filename, "rb"))
    try:
        fr.decrypt('')

    except NotImplementedError as e:
        logger.warning('decrypt_empty_pdf {0} {1}'.format(filename, e))
        return -1
    return fr
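In the classic PyPDF2 API, decrypt() additionally returns 0 when the password is rejected and 1 or 2 on success, so a variant can check the return value explicitly; a sketch (the _checked name is made up):

def decrypt_empty_pdf_checked(filename):
    fr = pdf.PdfFileReader(open(filename, "rb"))
    try:
        # decrypt('') returns 0 if the empty password fails, 1/2 on success
        if fr.decrypt('') == 0:
            logger.warning('real password set on {0} -> john'.format(filename))
            return -1
    except NotImplementedError as e:
        # classic PyPDF2 only implements the V1/V2 encryption algorithms
        logger.warning('decrypt_empty_pdf {0} {1}'.format(filename, e))
        return -1
    return fr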
Example 5
def process_queue_data(filename, data, queue_type):
    ''' main function for processing gathered data
        I use this central function so the handling lives in *one* place
        and it is easy to change the data handling at a later step without
        deconstructing the whole code
    '''
    ana_dict = {}
    url_dict = {}

    if queue_type == 'doc_info':
        logger.info('Queue DocInfo Data {0}'.format(filename))
        name = find_name(filename)
        path = filename

        # create a hash over the file path
        # hm, removed for now
        # path_hash = create_sha256(path)

        # order data in dict for analyse queue
        ana_dict = {path: {'filename': name, 'data': data}}
        #print('data:',data)
        #print('ana_dcit:',ana_dict)

        # add the data to queue
        add_queue(ana_q, ana_dict)

    elif queue_type == 'doc_xmp_info':
        logger.info('Queue DocXMPInfo Data {0}'.format(filename))
        logger.warning(
            'DocXMPInfo json processing not supported {0}'.format(filename))

    elif queue_type == 'url':
        # prepare queue entry
        logger.info('Url Queue {0}'.format(data))
        url_dict = {'url': data, 'filename': filename}
        sha256 = create_sha256(data)
        url_d[sha256] = url_dict

        # add dict to queue
        add_queue(url_q, url_dict)

    else:
        print('[-] Sorry, unknown queue. DEBUG!')
        logger.critical('Unknown queue')
        return False

    return True
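The module-level queues (ana_q, url_q, url_d) and the add_queue helper are not shown in this listing; a minimal sketch of the presumed setup with the standard library queue module (hypothetical reconstruction):

import queue

ana_q = queue.Queue()   # analysis results, consumed by the report functions
url_q = queue.Queue()   # urls waiting to be downloaded
url_d = {}              # sha256(url) -> {'url': ..., 'filename': ...}

def add_queue(q, item):
    # trivial wrapper, kept in one place so the backend can be swapped later
    q.put(item)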
Example 6
def seek_and_analyse(search, args, outdir):
    ''' function for keeping all the steps of searching for pdfs and analysing
        them together
    '''
    # check how many hits we got
    # seems like the method is broken in the googlesearch library :(
    #code, hits = hits_google(search,args)
    #if code:
    #    print('Got {0} hits'.format(hits))

    # use the search function of googlesearch to get the results
    code, values = search_google(search, args)
    if not code:
        if values.code == 429:
            logger.warning(
                '[-] Too many requests, time to change ip address or use proxychains'
            )
        else:
            logger.warning('Google returned error {0}'.format(values))

        return -1

    for item in values:
        filename = find_name(item)
        process_queue_data(filename, item, 'url')

    # urls = search_pdf(search,args)

    # *if* we get an answer
    if not url_q.empty():
        # if urls != -1:
        # process through the list and get the pdfs
        while not url_q.empty():
            item = url_q.get()
            # print(item)
            url = item['url']
            rd_grabrun = grab_run(url, args, outdir)
            code = rd_grabrun['code']
            savepath = rd_grabrun['data']
            if code:
                _parse_pdf(savepath)

    return True
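search_google wraps the googlesearch package; a plausible minimal sketch of the wrapper, consistent with the values.code == 429 check above (hypothetical reconstruction; the exact search() parameters differ between googlesearch releases):

import urllib.error
from googlesearch import search

def search_google(term, args):
    # hypothetical wrapper: returns (True, [urls]) or (False, error)
    try:
        results = list(search('{0} filetype:pdf'.format(term)))
    except urllib.error.HTTPError as e:
        return False, e     # e.code == 429 means google rate-limited us
    return True, results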
Example 7
def store_file(url, data, outdir):
    ''' store the downloaded data to a file
        params: url     - used to create the filename
                data    - the data of the file
                outdir  - the directory to store into
        returns: dict { "code":<code>, "data":<savepath>, "error":<error> }
                 - the status flag, the save path, the error
    '''

    logger.info('Store file {0}'.format(url))
    name = find_name(url)

    # only allow the stored file a name of at most 50 chars
    if len(name) > 50:
        name = name[:50]

    # build up the save path
    save = "%s/%s" % (outdir, name)

    try:
        f = open(save, "wb")

    except OSError as e:
        logger.warning('store_file {0}'.format(e))
        # return ret_dict
        return {"code": False, "data": save, "error": e}

    # write the data and return the written bytes
    ret = f.write(data)

    # check if bytes are zero
    if ret == 0:
        logger.warning('Written {0} bytes for file: {1}'.format(ret, save))

    else:
        # log to info that bytes and file has been written
        logger.info('Written {0} bytes for file: {1}'.format(ret, save))

    # close file descriptor
    f.close()

    # return ret_dict
    return {"code": True, "data": save, "error": False}
Example 8
def download_file(url, args, header_data):
    ''' downloading the file for later analysis 
        params: url         - the url
                args        - argparse args namespace
                header_data - pre-defined header data
        returns: ret_dict
    '''

    # check the remote tls certificate or not?
    cert_check = args.cert_check

    # run our try catch routine
    try:
        # request the url and save the response in req
        # give header data and set verify as delivered by args.cert_check
        req = requests.get(url, headers=header_data, verify=cert_check)

    except requests.exceptions.SSLError as e:
        logger.warning('download file {0} {1}'.format(url, e))

        # return retdict; req was never assigned here, so there is no data
        return {"code": False, "data": False, "error": e}

    except requests.exceptions.InvalidSchema as e:
        logger.warning('download file {0} {1}'.format(url, e))

        # return retdict
        return {"code": False, "data": False, "error": e}

    except socket.gaierror as e:
        logger.warning('download file, host not known {0} {1}'.format(url, e))
        return {"code": False, "data": False, "error": e}

    except Exception as e:
        logger.warning(
            'download file, something wrong with remote server? {0} {1}'.format(
                url, e))
        # return retdict; requests.get raised before req was assigned,
        # so there is no data to hand back
        return {"code": False, "data": False, "error": True}

    #finally:
    # lets close the socket
    #req.close()

    # return retdict
    return {"code": True, "data": req, "error": False}
Example 9
def get_xmp_meta_data(filename, filehandle):
    ''' get the xmp meta data
    '''

    err_dict = {}
    real_extract = {}
    xmp_dict = {}

    fh = filehandle

    try:
        xmp_meta = fh.getXmpMetadata()

    except xml.parsers.expat.ExpatError as e:
        logger.warning('get_xmp_meta_data error {0}'.format(e))
        err_dict = {'error': str(e)}
        return -1

    finally:
        process_queue_data(filename, err_dict, 'doc_xmp_info')

    if xmp_meta is not None:
        try:

            print('xmp_meta: {0} {1} {2} {3} {4} {5}'.format(
                xmp_meta.pdf_producer, xmp_meta.pdf_pdfversion,
                xmp_meta.dc_contributor, xmp_meta.dc_creator, xmp_meta.dc_date,
                xmp_meta.dc_subject))
        #print('xmp_meta cache: {0}'.format(xmp_meta.cache))
        #print('xmp_meta custom properties: {0}'.format(xmp_meta.custom_properties))
        #embed()
        except AttributeError as e:
            logger.warning('xmp_meta print {0}'.format(e))
            return False

    return xmp_dict
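Note that xmp_dict is returned empty; a small sketch of how the printed fields could be collected into it before returning (an assumption, not the original behavior):

# inside the `if xmp_meta is not None:` branch:
xmp_dict = {
    'producer':    xmp_meta.pdf_producer,
    'pdfversion':  xmp_meta.pdf_pdfversion,
    'contributor': xmp_meta.dc_contributor,
    'creator':     xmp_meta.dc_creator,
    'date':        xmp_meta.dc_date,
    'subject':     xmp_meta.dc_subject,
}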
Example 10
def run(args):

    # initialize logger
    logger.info('{0} Started'.format(name))

    # create some variables

    # outfile name
    if args.outfile:
        out_filename = args.outfile
    else:
        out_filename = 'pdfgrab_analysis'

    # specify output directory
    outdir = args.outdir

    # create output directory
    make_directory(outdir)

    # lets see what the object is
    if args.url_single:
        url = args.url_single
        logger.info('Grabbing {0}'.format(url))
        logger.write_to_log('Grabbing %s' % (url))
        grab_url(url, args, outdir)

    elif args.file_single:
        pdffile = args.file_single
        logger.info('Parsing {0}'.format(pdffile))
        _parse_pdf(pdffile)

    elif args.search:
        search = args.search
        logger.info('Seek and analyse {0}'.format(search))
        # seek_and_analyse signals failure with -1, which is truthy
        if seek_and_analyse(search, args, outdir) == -1:
            return -1

    elif args.files_dir:
        directory = args.files_dir
        logger.info('Analyse pdfs in directory {0}'.format(directory))
        try:
            files = os.listdir(directory)
        except OSError as e:
            logger.warning('Error in args.files_dir: {0}'.format(e))
            return False

        for f in files:
            # naive filter function, later usage of filemagic possible
            if f.find('.pdf') != -1:
                fpath = '%s/%s' % (directory, f)
                _parse_pdf(fpath)

    # simply generate html report from json outfile
    elif args.gen_html_report:
        with open(args.gen_html_report, 'r') as fr:
            analysis_dict = json.loads(fr.read())
        if create_html_report(analysis_dict, outdir, out_filename):
            logger.info('Successfully created html report')
            sys.exit(0)
        else:
            sys.exit(1)

    else:
        print('[-] Dunno what to do, bro. Use help. {0} -h'.format(
            sys.argv[0]))
        sys.exit(1)

    # creating the analysis dictionary for reporting
    analysis_dict = prepare_analysis_dict(ana_q)

    # lets go through the different reporting types
    if args.report_txt:
        if create_txt_report(analysis_dict, outdir, out_filename):
            logger.info('Successfully created txt report')

    if args.report_json:
        if create_json_report(analysis_dict, outdir, out_filename):
            logger.info('Successfully created json report')

    if args.report_html:
        if create_html_report(analysis_dict, outdir, out_filename):
            logger.info('Successfully created html report')

    if args.report_url_txt:
        if create_url_txt(url_d, outdir, out_filename):
            logger.info('Successfully created txt url report')

    if args.report_url_json:
        if create_url_json(url_d, outdir, out_filename):
            logger.info('Successfully created json url report')

    return 42
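run(args) expects an argparse namespace carrying the attributes used above; a minimal sketch of the presumed CLI wiring (hypothetical: the flag spellings are guesses, the dest values match the code, and the remaining options are defaulted so the sketch actually runs):

import argparse

def main():
    parser = argparse.ArgumentParser(prog='pdfgrab')
    parser.add_argument('-u', dest='url_single', help='grab a single url')
    parser.add_argument('-f', dest='file_single', help='parse a single pdf')
    parser.add_argument('-s', dest='search', help='search term / google dork')
    parser.add_argument('-d', dest='files_dir', help='parse all pdfs in a directory')
    parser.add_argument('-o', dest='outdir', default='pdfgrab_out')
    parser.add_argument('--outfile', dest='outfile')
    parser.set_defaults(gen_html_report=None, cert_check=True,
                        report_txt=False, report_json=False, report_html=False,
                        report_url_txt=False, report_url_json=False)
    run(parser.parse_args())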
Example 11
def get_DocInfo(filename, filehandle):
    ''' the easy way to extract metadata

        indirectObjects...
        there is an interesting situation: some pdfs seem to have the same
        information stored in different places, or things get overwritten,
        and extraction then sometimes yields indirect objects instead of
        values, which is ugly. bad example:

        {'/Title': IndirectObject(111, 0), '/Producer': IndirectObject(112, 0), '/Creator': IndirectObject(113, 0), '/CreationDate': IndirectObject(114, 0), '/ModDate': IndirectObject(114, 0), '/Keywords': IndirectObject(115, 0), '/AAPL:Keywords': IndirectObject(116, 0)}

        normally getObject() is the method to resolve this, however it was
        not working in this particular case. this thing might even bring up
        some more nasty things, so as a (probably weak) defense and
        workaround the pdf objects are not used after this function and all
        data is converted to strings.
    '''

    err_dict = {}
    real_extract = {}

    fh = filehandle

    try:
        extract = fh.documentInfo

    # pdf.utils.PdfReadError and PyPDF2.utils.PdfReadError name the same
    # class (pdf is the import alias), so one handler covers both
    except pdf.utils.PdfReadError as e:
        logger.warning('get_doc_info {0}'.format(e))
        err_dict = {'error': str(e)}
        return -1

    finally:
        process_queue_data(filename, err_dict, 'doc_info')

    print('-' * 80)
    print('File: %s' % filename)
    #	embed()
    # there are situations when documentinfo does not return anything
    # and extract is None
    if extract is None:
        err_dict = {'error': 'getDocumentInfo() returns None'}
        process_queue_data(filename, err_dict, 'doc_info')
        return -1

    try:
        for k in extract.keys():
            key = str(k)
            value = str(extract[k])
            edata = '%s %s' % (key, value)
            print(edata)
            real_extract[key] = value
        print('-' * 80)

    except PyPDF2.utils.PdfReadError as e:
        logger.warning('get_doc_info {0}'.format(e))
        err_dict = {'error': str(e)}
        process_queue_data(filename, err_dict, 'doc_info')
        return -1

    process_queue_data(filename, real_extract, 'doc_info')
    return real_extract
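For the IndirectObject situation described in the docstring, getObject() is PyPDF2's documented resolver; a hedged sketch of how the extraction loop could try it first (the docstring notes it did not help in at least one observed case):

# inside the extraction loop, before stringifying:
value = extract[k]
if isinstance(value, pdf.generic.IndirectObject):
    # resolve the reference; may still misbehave on broken pdfs
    value = value.getObject()
real_extract[str(k)] = str(value)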