Example #1
    def _parse(self, filepath):
        """Parses the PDF for static information. Uses PyV8 from peepdf to
        extract JavaScript from PDF objects.
        @param filepath: Path to file to be analyzed.
        @return: results dict or None.
        """
        # Load the PDF with PDFiD and convert it to JSON for processing
        pdf_data = PDFiD(filepath, False, True)
        pdf_json = PDFiD2JSON(pdf_data, True)
        pdfid_data = json.loads(pdf_json)[0]

        info = {}
        info["PDF Header"] = pdfid_data["pdfid"]["header"]
        info["Total Entropy"] = pdfid_data["pdfid"]["totalEntropy"]
        info["Entropy In Streams"] = pdfid_data["pdfid"]["streamEntropy"]
        info["Entropy Out Streams"] = pdfid_data["pdfid"]["nonStreamEntropy"]
        info["Count %% EOF"] = pdfid_data["pdfid"]["countEof"]
        info["Data After EOF"] = pdfid_data["pdfid"]["countChatAfterLastEof"]
        dates = pdfid_data["pdfid"]["dates"]["date"]

        # Collect the PDFiD keyword counts (reported under "Streams" in the results).
        streams = {}
        for stream in pdfid_data["pdfid"]["keywords"]["keyword"]:
            streams[str(stream["name"])] = stream["count"]

        result = {}
        result["Info"] = info
        result["Dates"] = dates
        result["Streams"] = streams

        log.debug("About to parse with PDFParser")
        parser = PDFParser()
        ret, pdf = parser.parse(filepath, True, False)
        objects = []
        retobjects = []

        for i in range(len(pdf.body)):
            body = pdf.body[i]
            objects = body.objects

            for index in objects:
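                # Each entry wraps a single indirect PDF object; record its
                # id, offset and size before inspecting the parsed object.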
                oid = objects[index].id
                offset = objects[index].offset
                size = objects[index].size
                details = objects[index].object

                obj_data = {}
                obj_data["Object ID"] = oid
                obj_data["Offset"] = offset
                obj_data["Size"] = size
                if details.type == "stream":
                    encoded_stream = details.encodedStream
                    decoded_stream = details.decodedStream
                    obj_data["File Type"] = _get_filetype(decoded_stream)[:100]
                    if HAVE_PYV8:
                        try:
                            jsdata = analyseJS(decoded_stream.strip())[0][0]
                        except Exception:
                            jsdata = "PyV8 failed to parse the stream."
                        if jsdata is None:
                            jsdata = "PyV8 did not detect JavaScript in the stream. (Possibly encrypted)"

                        # The following loop is required to "JSONify" the strings returned from PyV8.
                        # As PyV8 returns byte strings, we must parse out bytecode and
                        # replace it with an escape '\'. We can't use encode("string_escape")
                        # as this would mess up the new line representation which is used for
                        # beautifying the javascript code for Django's web interface.
                        ret_data = ""
                        for x in xrange(len(jsdata)):
                            if ord(jsdata[x]) > 127:
                                tmp = "\\x" + str(jsdata[x].encode("hex"))
                            else:
                                tmp = jsdata[x]
                            ret_data += tmp
                    else:
                        ret_data = "PyV8 not installed, unable to extract JavaScript."

                    obj_data["Data"] = ret_data
                    retobjects.append(obj_data)

                else:
                    obj_data["File Type"] = "Encoded"
                    obj_data["Data"] = "Encoded"
                    retobjects.append(obj_data)

        result["Objects"] = retobjects

        return result
Example #2
    def _parse(self, filepath):
        """Parses the PDF for static information. Uses PyV8 from peepdf to
        extract JavaScript from PDF objects.
        @param filepath: Path to file to be analyzed.
        @return: results dict or None.
        """
        # Load the PDF with PDFiD and convert it to JSON for processing
        pdf_data = PDFiD(filepath, False, True)
        pdf_json = PDFiD2JSON(pdf_data, True)
        pdfid_data = json.loads(pdf_json)[0]

        info = {}
        info["PDF Header"] = pdfid_data['pdfid']['header']
        info["Total Entropy"] = pdfid_data['pdfid']['totalEntropy']
        info['Entropy In Streams'] = pdfid_data['pdfid']['streamEntropy']
        info['Entropy Out Streams'] = pdfid_data['pdfid']['nonStreamEntropy']
        info['Count %% EOF'] = pdfid_data['pdfid']['countEof']
        info['Data After EOF'] = pdfid_data['pdfid']['countChatAfterLastEof']
        # Note, PDFiD doesn't interpret some dates properly, specifically it doesn't
        # seem to be able to properly represent time zones that involve fractions of
        # an hour
        dates = pdfid_data['pdfid']['dates']['date']

        # Get keywords, counts and format.
        keywords = {}
        for keyword in pdfid_data['pdfid']['keywords']['keyword']:
            keywords[str(keyword['name'])] = keyword['count']

        result = {}
        result["Info"] = info
        result["Dates"] = dates
        result["Keywords"] = keywords

        log.debug("About to parse with PDFParser")
        parser = PDFParser()
        ret, self.pdf = parser.parse(filepath,
                                     forceMode=True,
                                     looseMode=True,
                                     manualAnalysis=False)
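        # URLs recovered from embedded JavaScript and URIs taken from /Link
        # annotations are collected into separate sets while walking the
        # objects below.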
        urlset = set()
        annoturiset = set()
        objects = []
        retobjects = []
        metadata = dict()

        self._set_base_uri()

        for i in range(len(self.pdf.body)):
            body = self.pdf.body[i]
            metatmp = self.pdf.getBasicMetadata(i)
            if metatmp:
                metadata = metatmp

            objects = body.objects

            for index in objects:
                oid = objects[index].id
                offset = objects[index].offset
                size = objects[index].size
                details = objects[index].object

                obj_data = {}
                obj_data["Object ID"] = oid
                obj_data["Offset"] = offset
                obj_data["Size"] = size
                if details.type == 'stream':
                    encoded_stream = details.encodedStream
                    decoded_stream = details.decodedStream
                    if HAVE_PYV8:
                        jsdata = None
                        try:
                            jslist, unescapedbytes, urlsfound, errors, ctxdummy = analyseJS(
                                decoded_stream.strip())
                            jsdata = jslist[0]
                        except Exception:
                            continue
                        if len(errors):
                            continue
                        if jsdata is None:
                            continue

                        for url in urlsfound:
                            urlset.add(url)

                        # The following loop is required to "JSONify" the strings returned from PyV8.
                        # As PyV8 returns byte strings, we must parse out bytecode and
                        # replace it with an escape '\'. We can't use encode("string_escape")
                        # as this would mess up the new line representation which is used for
                        # beautifying the javascript code for Django's web interface.
                        ret_data = ""
                        for x in xrange(len(jsdata)):
                            if ord(jsdata[x]) > 127:
                                tmp = "\\x" + str(jsdata[x].encode("hex"))
                            else:
                                tmp = jsdata[x]
                            ret_data += tmp
                    else:
                        continue

                    obj_data["Data"] = ret_data
                    retobjects.append(obj_data)
                elif details.type == "dictionary" and details.hasElement("/A"):
                    # verify it to be a link type annotation
                    subtype_elem = details.getElementByName("/Subtype")
                    type_elem = details.getElementByName("/Type")
                    if not subtype_elem or not type_elem:
                        continue
                    subtype_elem = self._get_obj_val(i, subtype_elem)
                    type_elem = self._get_obj_val(i, type_elem)
                    if (subtype_elem.getValue() != "/Link"
                            or type_elem.getValue() != "/Annot"):
                        continue
                    a_elem = details.getElementByName("/A")
                    a_elem = self._get_obj_val(i, a_elem)
                    if a_elem.type == "dictionary" and a_elem.hasElement(
                            "/URI"):
                        uri_elem = a_elem.getElementByName("/URI")
                        uri_elem = self._get_obj_val(i, uri_elem)
                        annoturiset.add(self.base_uri + uri_elem.getValue())
                else:
                    # can be dictionaries, arrays, etc, don't bother displaying them
                    # all for now
                    pass
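
The /Link annotation handling in Example #2 can be factored out in the same spirit. The helper below is a sketch rather than code from the module: it reuses only calls that already appear above (hasElement, getElementByName, getValue and the module's own _get_obj_val, which is passed the body index i), and returns the annotation's target URI or None.

    def _link_annotation_uri(self, version, details):
        """Sketch of the /Link annotation check from Example #2 (illustrative)."""
        if not details.hasElement("/A"):
            return None
        subtype_elem = details.getElementByName("/Subtype")
        type_elem = details.getElementByName("/Type")
        if not subtype_elem or not type_elem:
            return None
        subtype_elem = self._get_obj_val(version, subtype_elem)
        type_elem = self._get_obj_val(version, type_elem)
        if subtype_elem.getValue() != "/Link" or type_elem.getValue() != "/Annot":
            return None
        a_elem = self._get_obj_val(version, details.getElementByName("/A"))
        if a_elem.type == "dictionary" and a_elem.hasElement("/URI"):
            uri_elem = self._get_obj_val(version, a_elem.getElementByName("/URI"))
            return uri_elem.getValue()
        return None

The dictionary branch of the object loop would then collapse to a call such as uri = self._link_annotation_uri(i, details), adding self.base_uri + uri to annoturiset whenever a URI comes back.
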
Example #3
    def _parse(self, filepath):
        """Parses the PDF for static information. Uses PyV8 from peepdf to
        extract JavaScript from PDF objects.
        @param filepath: Path to file to be analyzed.
        @return: results dict or None.
        """
        # Load the PDF with PDFiD and convert it to JSON for processing
        pdf_data = PDFiD(filepath, False, True)
        pdf_json = PDFiD2JSON(pdf_data, True)
        pdfid_data = json.loads(pdf_json)[0]

        info = {}
        info["PDF Header"] = pdfid_data['pdfid']['header']
        info["Total Entropy"] = pdfid_data['pdfid']['totalEntropy']
        info['Entropy In Streams'] = pdfid_data['pdfid']['streamEntropy']
        info['Entropy Out Streams'] = pdfid_data['pdfid']['nonStreamEntropy']
        info['Count %% EOF'] = pdfid_data['pdfid']['countEof']
        info['Data After EOF'] = pdfid_data['pdfid']['countChatAfterLastEof']
        # Note, PDFiD doesn't interpret some dates properly, specifically it doesn't
        # seem to be able to properly represent time zones that involve fractions of
        # an hour
        dates = pdfid_data['pdfid']['dates']['date']

        # Get keywords, counts and format.
        keywords = {}
        for keyword in pdfid_data['pdfid']['keywords']['keyword']:
            keywords[str(keyword['name'])] = keyword['count']

        result = {}
        result["Info"] = info
        result["Dates"] = dates
        result["Keywords"] = keywords

        log.debug("About to parse with PDFParser")
        parser = PDFParser()
        ret, pdf = parser.parse(filepath, True, False)
        urlset = set()
        objects = []
        retobjects = []
        metadata = dict()

        for i in range(len(pdf.body)):
            body = pdf.body[i]
            metatmp = pdf.getBasicMetadata(i)
            if metatmp:
                metadata = metatmp

            objects = body.objects

            for index in objects:
                oid = objects[index].id
                offset = objects[index].offset
                size = objects[index].size
                details = objects[index].object

                obj_data = {}
                obj_data["Object ID"] = oid
                obj_data["Offset"] = offset
                obj_data["Size"] = size
                if details.type == 'stream':
                    encoded_stream = details.encodedStream
                    decoded_stream = details.decodedStream
                    if HAVE_PYV8:
                        jsdata = None
                        try:
                            jslist, unescapedbytes, urlsfound, errors, ctxdummy = analyseJS(decoded_stream.strip())
                            jsdata = jslist[0]
                        except Exception:
                            continue
                        if len(errors):
                            continue
                        if jsdata is None:
                            continue

                        for url in urlsfound:
                            urlset.add(url)

                        # The following loop is required to "JSONify" the strings returned from PyV8.
                        # As PyV8 returns byte strings, we must parse out bytecode and
                        # replace it with an escape '\'. We can't use encode("string_escape")
                        # as this would mess up the new line representation which is used for
                        # beautifying the javascript code for Django's web interface.
                        ret_data = ""
                        for x in xrange(len(jsdata)):
                            if ord(jsdata[x]) > 127:
                                tmp = "\\x" + str(jsdata[x].encode("hex"))
                            else:
                                tmp = jsdata[x]
                            ret_data += tmp
                    else:
                        continue

                    obj_data["Data"] = ret_data
                    retobjects.append(obj_data)
                else:
                    # can be dictionaries, arrays, etc, don't bother displaying them
                    # all for now
                    pass
                    #obj_data["File Type"] = "Encoded"
                    #obj_data["Data"] = "Encoded"
                    #retobjects.append(obj_data)

            result["JSStreams"] = retobjects
Example #4
    def _parse(self, filepath):
        """Parses the PDF for static information. Uses PyV8 from peepdf to
        extract JavaScript from PDF objects.
        @param filepath: Path to file to be analyzed.
        @return: results dict or None.
        """
        # Load the PDF with PDFiD and convert it to JSON for processing
        pdf_data = PDFiD(filepath, False, True)
        pdf_json = PDFiD2JSON(pdf_data, True)
        pdfid_data = json.loads(pdf_json)[0]

        info = {}
        info["PDF Header"] = pdfid_data['pdfid']['header']
        info["Total Entropy"] = pdfid_data['pdfid']['totalEntropy']
        info['Entropy In Streams'] = pdfid_data['pdfid']['streamEntropy']
        info['Entropy Out Streams'] = pdfid_data['pdfid']['nonStreamEntropy']
        info['Count %% EOF'] = pdfid_data['pdfid']['countEof']
        info['Data After EOF'] = pdfid_data['pdfid']['countChatAfterLastEof']
        dates = pdfid_data['pdfid']['dates']['date']

        # Collect the PDFiD keyword counts (reported under "Streams" in the results).
        streams = {}
        for stream in pdfid_data['pdfid']['keywords']['keyword']:
            streams[str(stream['name'])] = stream['count']

        result = {}
        result["Info"] = info
        result["Dates"] = dates
        result["Streams"] = streams

        log.debug("About to parse with PDFParser")
        parser = PDFParser()
        ret, pdf = parser.parse(filepath, True, False)
        objects = []
        retobjects = []

        for i in range(len(pdf.body)):
            body = pdf.body[i]
            objects = body.objects

            for index in objects:
                oid = objects[index].id
                offset = objects[index].offset
                size = objects[index].size
                details = objects[index].object

                obj_data = {}
                obj_data["Object ID"] = oid
                obj_data["Offset"] = offset
                obj_data["Size"] = size
                if details.type == 'stream':
                    encoded_stream = details.encodedStream
                    decoded_stream = details.decodedStream
                    obj_data["File Type"] = _get_filetype(decoded_stream)[:100]
                    if HAVE_PYV8:
                        try:
                            jsdata = analyseJS(decoded_stream.strip())[0][0]
                        except Exception:
                            jsdata = "PyV8 failed to parse the stream."
                        if jsdata is None:
                            jsdata = "PyV8 did not detect JavaScript in the stream. (Possibly encrypted)"

                        # The following loop is required to "JSONify" the strings returned from PyV8.
                        # As PyV8 returns byte strings, we must parse out bytecode and
                        # replace it with an escape '\'. We can't use encode("string_escape")
                        # as this would mess up the new line representation which is used for
                        # beautifying the javascript code for Django's web interface.
                        ret_data = ""
                        for x in xrange(len(jsdata)):
                            if ord(jsdata[x]) > 127:
                                tmp = "\\x" + str(jsdata[x].encode("hex"))
                            else:
                                tmp = jsdata[x]
                            ret_data += tmp
                    else:
                        ret_data = "PyV8 not installed, unable to extract JavaScript."

                    obj_data["Data"] = ret_data
                    retobjects.append(obj_data)

                else:
                    obj_data["File Type"] = "Encoded"
                    obj_data["Data"] = "Encoded"
                    retobjects.append(obj_data)

        result["Objects"] = retobjects

        return result