def extract_permissions(file):
    """Return the permissions reported by the APK at ``file``.

    The dex payload is still parsed, analysed and wired to the DAD
    decompiler before the permissions are read, so any error raised while
    loading the bytecode keeps propagating to the caller.

    Args:
        file: Value handed to ``APK``.  The name shadows the Python 2
            builtin but is kept so keyword callers continue to work.

    Returns:
        The result of ``APK.get_permissions()``.
    """
    a = APK(file)
    d = DalvikVMFormat(a.get_dex())
    dx = VMAnalysis(d)
    # A second parse of the same dex (plus a uVMAnalysis over it) used to be
    # built here and then discarded; it contributed nothing to the result.
    d.set_vmanalysis(dx)
    d.set_decompiler(DecompilerDAD(d, dx))
    return a.get_permissions()
Beispiel #2
0
    def __analyze_dex(self, dex_file, raw=False):
        """Build a ``VMAnalysis`` for a dex payload.

        Args:
            dex_file: Path of the dex file to read, or the dex content
                itself when ``raw`` is truthy.
            raw: When falsy (the default) ``dex_file`` is opened in binary
                mode and read; otherwise it is passed to
                ``DalvikVMFormat`` unchanged.

        Returns:
            The ``VMAnalysis`` instance, already attached to its
            ``DalvikVMFormat`` through ``set_vmanalysis``.
        """
        if raw:
            dex_content = dex_file
        else:
            # Context manager so the handle is closed even if read() fails.
            with open(dex_file, "rb") as handle:
                dex_content = handle.read()

        # DalvikVMFormat
        dalvik_vm_format = DalvikVMFormat(dex_content)

        # VMAnalysis
        vm_analysis = VMAnalysis(dalvik_vm_format)
        dalvik_vm_format.set_vmanalysis(vm_analysis)

        return vm_analysis
    def __init__(self, args):

        self.apk = args.apk
        self.verbosity = args.verbosity
        self.output_location = args.output_location
        self.file_identifier = args.apk.split('.')[0]
        self.file_identifier = self.file_identifier[-24:]

        # print "Analyzing " + self.apk
        # print " Output Location " + self.output_location
        # print "File Identifier " + self.file_identifier

        # analyze the dex file
        print "From LOCATION = ", self.apk
        self.a = APK(self.apk)

        # get the vm analysis
        self.d = DalvikVMFormat(self.a.get_dex())
        self.dx = VMAnalysis(self.d)
        self.gx = GVMAnalysis(self.dx, None)

        self.d.set_vmanalysis(self.dx)
        self.d.set_gvmanalysis(self.gx)

        # create the cross reference
        self.d.create_xref()
        self.d.create_dref()

        print 'CWD: ', os.getcwd()
        predictor = Predict_Input(self.output_location, self.file_identifier)
        self.predictions = predictor.predict(self.apk, self.apk[:-4],
                                             self.output_location,
                                             self.file_identifier)

        try:
            # get the classes for this apk
            # store them in a dict
            self.classes = self.get_class_dict()

            # Find the R$layout class
            self.Rlayout = self.get_RLayout(self.d.get_classes())

            # Find the R$id class
            self.Rid = self.get_Rid(self.d.get_classes())

            # Store all fields referenced in R$id
            self.fields, self.field_refs = self.get_fields(self.Rid)
        except Exception, e:
            print e
Beispiel #4
0
    def __init__(self, apk_name):
        """Open ``INPUT_APK_DIR + apk_name + ".apk"`` and analyse its dex.

        Stores the package as ``self.a``, the dex object as ``self.d`` and
        the two analyses as ``self.dx`` / ``self.gx``, then creates the
        cross references on the dex object.
        """
        self.apk_name = apk_name
        self.apk = INPUT_APK_DIR + self.apk_name + ".apk"

        # Parse the package, then the dex bytecode it contains.
        package = APK(self.apk)
        self.a = package
        dex_vm = DalvikVMFormat(package.get_dex())
        self.d = dex_vm

        # Build both analyses before attaching either to the dex object.
        vm_analysis = VMAnalysis(dex_vm)
        self.dx = vm_analysis
        graph_analysis = GVMAnalysis(vm_analysis, None)
        self.gx = graph_analysis
        dex_vm.set_vmanalysis(vm_analysis)
        dex_vm.set_gvmanalysis(graph_analysis)

        # Cross references are created only once the analyses are attached.
        dex_vm.create_xref()
        dex_vm.create_dref()
Beispiel #5
0
    def process_vm(self):
        """
        Load and analyze the application's classes.dex.

        Nothing happens (and nothing is reported) when ``find_dex()``
        succeeds but ``self.apk.get_dex()`` returns a falsy value.

        Args:
            None

        Results:
            None
        """
        # Make sure classes.dex exists
        if not self.find_dex():
            CommandError("classes.dex not found (!)")
            return

        self.dex = self.apk.get_dex()
        if not self.dex:
            return

        # TODO Throw in a progress bar, this can take awhile
        self.logger.log("info", "Loading classes.dex ...")
        # androguard is imported only once there is a dex to process.
        from androguard.core.bytecodes.dvm import DalvikVMFormat
        from androguard.core.analysis.analysis import VMAnalysis
        from androguard.core.analysis.ganalysis import GVMAnalysis

        # Create a new virtual machine instance
        self.vm = DalvikVMFormat(self.dex)
        if not self.vm:
            CommandError("Cannot load VM instance (!)")
            return

        print(self.t.yellow("\n\t--> Loaded classes.dex (!)\n"))
        self.logger.log("info", "Analyzing classes.dex ...")

        # Analyze the virtual machine instance
        self.vmx = VMAnalysis(self.vm)
        self.gmx = GVMAnalysis(self.vmx, None)
        if not (self.vmx and self.gmx):
            CommandError("Cannot analyze VM instance (!)")
            return

        print(self.t.yellow("\n\t--> Analyzed classes.dex (!)\n"))
        self.vm.set_vmanalysis(self.vmx)
        self.vm.set_gvmanalysis(self.gmx)
        # Generate xref(s)
        self.vm.create_xref()
        self.vm.create_dref()
Beispiel #6
0
    def __init__(self, args):
        self.apk = args.apk
        self.verbosity = args.verbosity

        print "Analyzing " + self.apk

        # analyze the dex file
        self.a = APK(self.apk)

        # get the vm analysis
        self.d = DalvikVMFormat(self.a.get_dex())
        self.dx = VMAnalysis(self.d)
        self.gx = GVMAnalysis(self.dx, None)

        self.d.set_vmanalysis(self.dx)
        self.d.set_gvmanalysis(self.gx)

        # create the cross reference
        self.d.create_xref()
        self.d.create_dref()

        try:
            # get the classes for this apk
            # store them in a dict
            self.classes = self.get_class_dict()

            # Find the R$layout class
            self.Rlayout = self.get_RLayout(self.d.get_classes())

            # Find the R$id class
            self.Rid = self.get_Rid(self.d.get_classes())

            # Store all fields referenced in R$id
            self.fields, self.field_refs = self.get_fields(self.Rid)
        except Exception, e:
            print e
Beispiel #7
0
    def process_vm(self, apk=False, dex=False):
        """
        Load and analyze dex bytecode, taken either from the opened APK or
        from the dex referenced by ``self.dex``.

        When ``apk`` is set and ``find_dex()`` succeeds, the APK path is
        processed and the method returns without looking at ``dex``.
        Otherwise the ``dex`` path is processed if requested.  Any
        exception is reported through ``CommandError`` instead of being
        raised.

        Args:
            param1 = boolean
            param2 = boolean

        Results:
            None
        """
        try:
            # The APK route is only taken when it contains a classes.dex;
            # if not, control falls through to the dex route below.
            if apk and self.find_dex():
                self.dex = self.apk.get_dex()
                if not self.dex:
                    CommandError("process_vm : classes.dex not found (!)")
                    return

                self.logger.log("info", "Loading classes.dex ...")
                from androguard.core.bytecodes.dvm import DalvikVMFormat
                from androguard.core.analysis.analysis import VMAnalysis
                from androguard.core.analysis.ganalysis import GVMAnalysis

                # Here self.dex is what the APK object handed back.
                self.vm = DalvikVMFormat(self.dex)
                if not self.vm:
                    CommandError("process_vm : Cannot load VM instance (!)")
                    return

                print(self.t.yellow("\n\t--> Loaded classes.dex (!)\n"))
                self.logger.log("info", "Analyzing classes.dex ...")
                # Build the VMAnalysis and GVMAnalysis instances.
                self.vmx = VMAnalysis(self.vm)
                self.gmx = GVMAnalysis(self.vmx, None)
                if not (self.vmx and self.gmx):
                    CommandError("process_vm : Cannot analyze VM instance (!)")
                    return

                print(self.t.yellow("\n\t--> Analyzed classes.dex (!)\n"))
                # Attach the analyses, then generate xref(s) and dref(s).
                self.vm.set_vmanalysis(self.vmx)
                self.vm.set_gvmanalysis(self.gmx)
                self.vm.create_xref()
                self.vm.create_dref()
                return

            if not dex:
                return

            if not self.dex:
                CommandError("process_vm : classes.dex not found (!)")
                return

            from androguard.core.bytecodes.dvm import DalvikVMFormat
            from androguard.core.analysis.analysis import VMAnalysis
            from androguard.core.analysis.ganalysis import GVMAnalysis

            # Here self.dex is passed to self.util.read() before parsing.
            self.vm = DalvikVMFormat(self.util.read(self.dex))
            if not self.vm:
                CommandError("process_vm :" +
                             "Cannot load VM instance (!)")
                return

            # Messages show only the part of self.dex after the last "/".
            print(self.t.yellow(
                "\n\t--> Loaded {} (!)\n".format(self.dex.split("/")[-1])))
            self.logger.log(
                "info",
                "Analyzing {} ...".format(self.dex.split("/")[-1]))

            # Build the VMAnalysis and GVMAnalysis instances.
            self.vmx = VMAnalysis(self.vm)
            self.gmx = GVMAnalysis(self.vmx, None)
            if not (self.vmx and self.gmx):
                CommandError("process_vm :" +
                             "Cannot analyze VM instance (!)")
                return

            print(self.t.yellow(
                "\n\t--> Analyzed {} (!)\n".format(self.dex.split("/")[-1])))
            # Attach the analyses, then generate xref(s) and dref(s).
            self.vm.set_vmanalysis(self.vmx)
            self.vm.set_gvmanalysis(self.gmx)
            self.vm.create_xref()
            self.vm.create_dref()
            return
        except Exception as e:
            CommandError("process_vm : {}".format(e))
Beispiel #8
0
def extract_features(file_path):
    """Collect manifest, bytecode and heuristic features of one APK.

    Args:
        file_path: Value handed to ``APK``.

    Returns:
        A dict of features, including a ``'feature_vectors'`` sub-dict with
        0/1 lists aligned to ``API_CALLS``, ``PERMISSIONS`` and
        ``SPECIAL_STRINGS``; or ``None`` when loading or analysing the
        package raised an exception.
    """
    try:
        a = APK(file_path)
        d = DalvikVMFormat(a.get_dex())
        dx = VMAnalysis(d)
        # A second, independent parse feeds uVMAnalysis; only the
        # reflection check below consumes it.
        vm = dvm.DalvikVMFormat(a.get_dex())
        vmx = analysis.uVMAnalysis(vm)
        d.set_vmanalysis(dx)
        d.set_decompiler(DecompilerDAD(d, dx))
    except Exception:
        # Best effort: an unreadable package yields no features.  Catching
        # Exception (not a bare except) lets Ctrl-C and SystemExit through.
        return None

    result = {}
    result['android_version_code'] = a.get_androidversion_code()
    result['android_version_name'] = a.get_androidversion_name()
    result['max_sdk'] = a.get_max_sdk_version()
    result['min_sdk'] = a.get_min_sdk_version()
    result['libraries'] = a.get_libraries()
    result['filename'] = a.get_filename()
    result['target_sdk'] = a.get_target_sdk_version()

    # Fetch the raw bytes once and hash them twice.
    raw = a.get_raw()
    result['md5'] = hashlib.md5(raw).hexdigest()
    result['sha256'] = hashlib.sha256(raw).hexdigest()

    result['permissions'] = a.get_permissions()
    result['activities'] = a.get_activities()
    result['providers'] = a.get_providers()
    result['services'] = a.get_services()
    result['strings'] = d.get_strings()
    result['class_names'] = [c.get_name() for c in d.get_classes()]
    result['method_names'] = [m.get_name() for m in d.get_methods()]
    result['field_names'] = [f.get_name() for f in d.get_fields()]
    result['is_native_code'] = 1 if analysis.is_native_code(dx) else 0
    result['is_obfuscation'] = 1 if analysis.is_ascii_obfuscation(d) else 0
    result['is_crypto_code'] = 1 if analysis.is_crypto_code(dx) else 0
    result['is_dyn_code'] = 1 if analysis.is_dyn_code(dx) else 0
    result['is_reflection_code'] = 1 if analysis.is_reflection_code(vmx) else 0
    result['is_database'] = 1 if d.get_regex_strings(DB_REGEX) else 0

    # The entropy rate is computed over all identifier names together.
    result['entropy_rate'] = entropy_rate(
        result['class_names'] + result['method_names'] + result['field_names']
    )

    result['feature_vectors'] = {
        # Presence of each known API call in the apk
        'api_calls': [
            1 if dx.tainted_packages.search_methods(".", call, ".") else 0
            for call in API_CALLS
        ],
        # Presence of each known permission in the apk
        'permissions': [
            1 if permission in result['permissions'] else 0
            for permission in PERMISSIONS
        ],
        # Presence of each special string pattern among the dex strings
        'special_strings': [
            1 if d.get_regex_strings(word) else 0
            for word in SPECIAL_STRINGS
        ],
    }

    return result
Beispiel #9
0
from androguard import *
from androguard.core.bytecodes import apk
from androguard.core.bytecodes import dvm
from androguard.core.analysis.analysis import VMAnalysis

if __name__ == '__main__':
	path = "crackme02.apk"
	a = apk.APK(path)
	d = dvm.DalvikVMFormat(a.get_dex())
	x = VMAnalysis(d)

	for method in d.get_methods():
		g = x.get_method(method)

		if method.get_code() == None:
			continue

		print method.get_class_name(), method.get_name(), method.get_descriptor()

		idx = 0
		for i in g.get_basic_blocks().get():
			print "\t %s %x %x" % (i.name, i.start, i.end), '[ NEXT = ', ', '.join( "%x-%x-%s" % (j[0], j[1], j[2].get_name()) for j in i.get_next() ), ']', '[ PREV = ', ', '.join( j[2].get_name() for j in i.get_prev() ), ']'
			for ins in i.get_instructions():
				print "\t\t %x" % idx, ins.get_name(), ins.get_output()
			idx += ins.get_length()

		print ""
 def __init_androguard_objects(self, apk_file):
     """Open ``apk_file`` and cache the androguard handles on ``self``.

     Sets ``_a`` (package), ``_d`` (dex), ``_dx`` (analysis), ``_cm``
     (class manager) and ``_strings`` (strings of the dex), in that order.
     """
     package = apk.APK(apk_file)
     self._a = package
     dex_vm = dvm.DalvikVMFormat(package.get_dex())
     self._d = dex_vm
     self._dx = VMAnalysis(dex_vm)
     self._cm = dex_vm.get_class_manager()
     self._strings = dex_vm.get_strings()
class StaticAPKAnalyzer():
    """Extract static features from an APK with androguard and render them.

    Call ``analyze(apk_file)`` first.  ``str(analyzer)`` then yields either
    newline separated ``category::item`` lines or, when the instance was
    created with ``output_format='xml'``, an XML document.
    """

    # performs static analysis on given apk file
    def __init__(self, output_format=None):
        """Create an analyzer with empty state.

        Args:
            output_format: ``'xml'`` selects XML output in ``__str__``; any
                other value selects the ``category::item`` line format.
        """
        self._apk_data = dict()
        # androguard handles, filled in by analyze()
        self._a = None
        self._d = None
        self._dx = None
        self._cm = None
        self._strings = None

        # set output parameters
        categories = [
            'files', 'features', 'intent_filters', 'activities',
            'req_permissions', 'used_permissions', 'api_calls', 'crypto_calls',
            'net_calls', 'telephony_calls', 'suspicious_calls',
            'dynamic_calls', 'native_calls', 'reflection_calls', 'urls',
            'providers', 'receivers', 'services', 'libraries'
        ]

        self._out = {
            'format': output_format,
            'feat_len': 80,
            'categories': categories
        }

    def analyze(self, apk_file):
        """Discard earlier results and extract all features of ``apk_file``."""
        self._apk_data = dict()
        self.__init_androguard_objects(apk_file)
        self.__extract_features(apk_file)

    def set_max_output_feat_len(self, feat_len):
        """Set the length every item is truncated to when rendered."""
        # set maximal length of feature strings
        self._out['feat_len'] = feat_len

    def set_output_categories(self, categories):
        """Choose which feature categories are rendered (default: all)."""
        # specify feature categories that should be printed, by default, all extracted features are written to output.
        self._out['categories'] = categories

    def __init_androguard_objects(self, apk_file):
        """Open ``apk_file`` and cache the androguard handles on ``self``."""
        self._a = apk.APK(apk_file)
        self._d = dvm.DalvikVMFormat(self._a.get_dex())
        self._dx = VMAnalysis(self._d)
        self._cm = self._d.get_class_manager()
        self._strings = self._d.get_strings()

    def __extract_features(self, apk_file):
        """Populate ``self._apk_data`` with every feature category."""
        self.__calc_hashes(apk_file)
        self.__extract_apk_obj_features()

        # extract features from vm analysis object
        used_perms_dict = self._dx.get_permissions([])
        self._apk_data['used_permissions'] = used_perms_dict.keys()

        # Create the category up front so it exists even when no permission
        # is used; the calls behind every permission are merged into it.
        self._apk_data['api_calls'] = dict()
        for paths in used_perms_dict.values():
            self.__extract_dx_features('api_calls', paths)

        paths = self._dx.tainted_packages.search_crypto_packages()
        self.__extract_dx_features('crypto_calls', paths)
        paths = self._dx.tainted_packages.search_net_packages()
        self.__extract_dx_features('net_calls', paths)
        paths = self._dx.tainted_packages.search_telephony_packages()
        self.__extract_dx_features('telephony_calls', paths)
        paths = self._dx.get_tainted_packages().search_methods(
            "Ldalvik/system/DexClassLoader;", ".", ".")
        self.__extract_dx_features('dynamic_calls', paths)
        paths = self._dx.get_tainted_packages().search_methods(
            "Ljava/lang/reflect/Method;", ".", ".")
        self.__extract_dx_features('reflection_calls', paths)

        self.__extract_native_calls()
        self.__extract_urls()
        self.__extract_suspicious_calls()

    def __calc_hashes(self, apk_file):
        """Store the md5 and sha256 of ``apk_file`` via ``get_file_hash``."""
        self._apk_data['md5'] = get_file_hash('md5', apk_file)
        self._apk_data['sha256'] = get_file_hash('sha256', apk_file)

    def __extract_apk_obj_features(self):
        """Copy the features read from the APK object into ``_apk_data``."""
        self._apk_data['apk_name'] = str(basename(self._a.get_filename()))
        self._apk_data['package_name'] = str(self._a.get_package())
        self._apk_data['sdk_version'] = str(self._a.get_min_sdk_version())
        self._apk_data['features'] = self._a.get_elements(
            'uses-feature', 'android:name')
        self._apk_data['files'] = self._a.get_files()
        self._apk_data['activities'] = self._a.get_activities()
        self._apk_data['providers'] = self._a.get_providers()
        self._apk_data['req_permissions'] = self._a.get_permissions()
        self._apk_data['receivers'] = self._a.get_receivers()
        self._apk_data['services'] = self._a.get_services()
        self._apk_data['libraries'] = self._a.get_libraries()
        self._apk_data['intent_filters'] = self._a.get_elements(
            'action', 'android:name') + self._a.get_elements(
                'category', 'android:name')

    @staticmethod
    def __feature_name(class_name, method_name):
        """Return ``class->method`` with the first class character dropped.

        Any method name containing ``'init'`` after its first character
        (``<init>``, ``<clinit>``, ...) is reported as plain ``'init'``.
        """
        if method_name.find('init') > 0:
            method_name = 'init'
        return class_name[1:] + '->' + method_name

    def __extract_dx_features(self, category, paths):
        """Record the destination method of every path under ``category``.

        Entries are added to the category's existing dict (created on first
        use), so calling this repeatedly for one category accumulates
        instead of discarding the earlier calls.
        """
        features = self._apk_data.setdefault(category, dict())
        for path in paths:
            destination = path.get_dst(self._cm)
            features[self.__feature_name(destination[0], destination[1])] = 1

    def __extract_native_calls(self):
        """Record every dex method whose access flags contain 0x100."""
        native_calls = dict()
        self._apk_data['native_calls'] = native_calls
        for method in self._d.get_methods():

            # this condition is copied from show_NativeCalls()
            # (0x100 is ACC_NATIVE in the dex access flags)
            if method.get_access_flags() & 0x100:
                name = self.__feature_name(method.get_class_name(),
                                           method.get_name())
                native_calls[name] = 1

    def __extract_urls(self):
        """Record IPv4-looking tokens, URLs and URL hosts from the strings."""
        # Compiled once instead of being looked up for every string.
        ip_regex = re.compile(
            r'(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})')
        # Adjacent raw literals: the former backslash continuation inside a
        # single literal pulled the next line's indentation into the
        # pattern, in front of the %XX alternative.
        url_regex = re.compile(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
            r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')

        urls = dict()
        self._apk_data['urls'] = urls

        for string in self._strings:
            # search for ip addresses (only the first hit per string)
            ip = ip_regex.search(string)
            if ip is not None:
                urls[ip.group()] = 1

            # search for urls (only the first hit per string)
            url = url_regex.search(string)
            if url is not None:
                url = urllib.quote(url.group(), '>:/?')
                urls[url] = 1
                # add hostname
                urls[urlparse(url).netloc] = 1

    def __extract_suspicious_calls(self):
        """Record suspicious strings and calls under ``suspicious_calls``."""
        sus_calls = [
            'Ljava/net/HttpURLconnection;->setRequestMethod',
            'Ljava/net/HttpURLconnection', 'getExternalStorageDirectory',
            'getSimCountryIso', 'execHttpRequest', 'sendTextMessage',
            'Lorg/apache/http/client/methods/HttpPost', 'getSubscriberId',
            'Landroid/telephony/SmsMessage;->getMessageBody', 'getDeviceId',
            'getPackageInfo', 'getSystemService', 'getWifiState',
            'system/bin/su', 'system/xbin/su', 'setWifiEnabled',
            'setWifiDisabled', 'Cipher',
            'Ljava/io/IOException;->printStackTrace', 'android/os/Exec',
            'Ljava/lang/Runtime;->exec'
        ]

        suspicious = dict()
        self._apk_data['suspicious_calls'] = suspicious

        # A dex string is suspicious when it contains any marker above.
        for string in self._strings:
            if any(sc in string for sc in sus_calls):
                suspicious[string] = 1

        sus_tuples = [('java/net/HttpURLconnection', 'setRequestMethod'),
                      ('android/telephony/SmsMessage', 'getMessageBody'),
                      ('java/io/IOException', 'printStackTrace'),
                      ('java/lang/Runtime', 'exec')]

        for tpl in sus_tuples:
            # NOTE(review): the class names above carry no leading 'L', so
            # this slice drops their first letter ('ava/net/...'); confirm
            # whether that is intended before changing the search pattern.
            class_name = tpl[0][1:]
            name = tpl[1]
            paths = self._dx.tainted_packages.search_methods(
                class_name, name, '')
            for path in paths:
                method = path.get_dst(self._cm)
                suspicious[method[0] + '->' + method[1]] = 1

    def __str__(self):
        """Render the extracted data in the configured output format."""
        if self._out['format'] == 'xml':
            return self.__create_xml_string()
        return self.__get_feature_strings()

    def __get_feature_strings(self):
        """Return one ``category::item`` line per item with data."""
        limit = self._out['feat_len']
        lines = []
        for category in self._out['categories']:
            if category not in self._apk_data:
                continue
            for item in self._apk_data[category]:
                lines.append('{0}::{1}'.format(category, item[:limit]))
        return '\n'.join(lines)

    def __create_xml_string(self):
        """Return the XML document for the configured categories.

        Categories without extracted data are skipped, as in the line
        output, instead of raising ``KeyError``.
        """
        xml_str = '<static>'
        xml_str += self.__get_info_string()
        for category in self._out['categories']:
            if category in self._apk_data:
                xml_str += self.__get_category_string(category)
        xml_str += '\n</static>'

        doc = parseString(xml_str)
        return doc.toxml().replace('<static>', '\n<static>')

    @staticmethod
    def __xml_text(value):
        """Escape ``&``, ``<`` and ``>`` so ``value`` is valid XML text."""
        from xml.sax.saxutils import escape
        return escape(value)

    def __get_info_string(self):
        """Return the ``<info>`` element with hashes and package details."""
        istr = '\n\t<info>'
        for field in ('sha256', 'md5', 'apk_name', 'package_name',
                      'sdk_version'):
            istr += '\n\t\t<{0}>{1}</{0}>'.format(
                field, self.__xml_text(str(self._apk_data[field])))
        istr += '\n\t</info>'
        return istr

    def __get_category_string(self, category):
        """Return the XML element listing every item of ``category``."""
        field = self.__get_field_name(category)
        limit = self._out['feat_len']
        cat_str = '\n\t<{}>'.format(category)
        for item in self._apk_data[category]:
            # Truncate first, then escape, so no entity is cut in half.
            cat_str += '\n\t\t<{0}>{1}</{0}>'.format(
                field, self.__xml_text(item[:limit]))
        cat_str += '\n\t</{}>'.format(category)
        return cat_str

    @staticmethod
    def __get_field_name(category):
        """Return the singular tag name for a plural category name."""
        if category.endswith('ies'):
            return category[:-3] + 'y'
        else:
            return category[:-1]
Beispiel #12
0
from androguard.core.analysis.analysis import VMAnalysis
from androguard.core.bytecodes.apk import APK
from androguard.core.bytecodes.dvm import DalvikVMFormat
from core.analysis import *
if __name__ == '__main__':
    a = APK("1_1.apk")
    print len(a.get_activities())
    print a.get_main_activity()
    d = DalvikVMFormat(a.get_dex())
    dx = VMAnalysis(d)
    print dx.get_method_signature()
Beispiel #13
0
def _top_opcode_ngrams(opcodes, size, limit):
    """Return ``{str(ngram): 1}`` for the ``limit`` most frequent n-grams.

    Every window of ``size`` consecutive opcode names is counted, including
    the one that ends on the last opcode.
    """
    counts = Counter(
        tuple(opcodes[start:start + size])
        for start in range(len(opcodes) - size + 1)
    )
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return dict((str(ngram), 1) for ngram, _ in ranked[:limit])


def extract_features(file_path):
    """Collect manifest, bytecode and heuristic features of one APK.

    Args:
        file_path: Value handed to ``APK``.

    Returns:
        A dict of features, including a ``'feature_vectors'`` sub-dict with
        0/1 lists aligned to ``API_CALLS``, ``PERMISSIONS`` and
        ``SPECIAL_STRINGS`` plus an ``'opt_codes'`` dict that marks the
        ``NGRAM_THRE`` most frequent opcode ``NGRAM``-grams; or ``None``
        when loading or analysing the package raised an exception.
    """
    try:
        a = APK(file_path)
        d = DalvikVMFormat(a.get_dex())
        dx = VMAnalysis(d)
        # A second, independent parse feeds uVMAnalysis; only the
        # reflection check below consumes it.
        vm = dvm.DalvikVMFormat(a.get_dex())
        vmx = analysis.uVMAnalysis(vm)
        d.set_vmanalysis(dx)
        d.set_decompiler(DecompilerDAD(d, dx))
    except Exception:
        # Best effort: an unreadable package yields no features.  Catching
        # Exception (not a bare except) lets Ctrl-C and SystemExit through.
        return None

    result = {}
    result['android_version_code'] = a.get_androidversion_code()
    result['android_version_name'] = a.get_androidversion_name()
    result['max_sdk'] = a.get_max_sdk_version()
    result['min_sdk'] = a.get_min_sdk_version()
    result['libraries'] = a.get_libraries()
    result['filename'] = a.get_filename()
    result['target_sdk'] = a.get_target_sdk_version()

    # Fetch the raw bytes once and hash them twice.
    raw = a.get_raw()
    result['md5'] = hashlib.md5(raw).hexdigest()
    result['sha256'] = hashlib.sha256(raw).hexdigest()

    result['permissions'] = a.get_permissions()
    result['activities'] = a.get_activities()
    result['providers'] = a.get_providers()
    result['services'] = a.get_services()

    # Identifier names only feed the entropy rate; they are not stored.
    class_names = [c.get_name() for c in d.get_classes()]
    method_names = [m.get_name() for m in d.get_methods()]
    field_names = [f.get_name() for f in d.get_fields()]

    result['is_native_code'] = 1 if analysis.is_native_code(dx) else 0
    result['is_obfuscation'] = 1 if analysis.is_ascii_obfuscation(d) else 0
    result['is_crypto_code'] = 1 if analysis.is_crypto_code(dx) else 0
    result['is_dyn_code'] = 1 if analysis.is_dyn_code(dx) else 0
    result['is_reflection_code'] = 1 if analysis.is_reflection_code(vmx) else 0
    result['is_database'] = 1 if d.get_regex_strings(DB_REGEX) else 0

    # Classes, methods and fields each contribute once; method names used
    # to be added twice while field names were left out.
    result['entropy_rate'] = entropy_rate(
        class_names + method_names + field_names)

    # Opcode names of all methods, in iteration order.
    opt_seq = [
        instruction.get_name()
        for method in d.get_methods()
        for instruction in method.get_instructions()
    ]

    result['feature_vectors'] = {
        # Presence of each known API call in the apk
        'api_calls': [
            1 if dx.tainted_packages.search_methods(".", call, ".") else 0
            for call in API_CALLS
        ],
        # Presence of each known permission in the apk
        'permissions': [
            1 if permission in result['permissions'] else 0
            for permission in PERMISSIONS
        ],
        # Presence of each special string pattern among the dex strings
        'special_strings': [
            1 if d.get_regex_strings(word) else 0
            for word in SPECIAL_STRINGS
        ],
        'opt_codes': _top_opcode_ngrams(opt_seq, NGRAM, NGRAM_THRE),
    }

    return result
 def __init_androguard_objects(self, apk_file):
     """Open ``apk_file`` and cache the androguard handles on ``self``.

     Sets ``_a`` (package), ``_d`` (dex), ``_dx`` (analysis), ``_cm``
     (class manager) and ``_strings`` (strings of the dex), in that order.
     """
     package = apk.APK(apk_file)
     self._a = package
     dex_vm = dvm.DalvikVMFormat(package.get_dex())
     self._d = dex_vm
     self._dx = VMAnalysis(dex_vm)
     self._cm = dex_vm.get_class_manager()
     self._strings = dex_vm.get_strings()
class StaticAPKAnalyzer():
    
    # performs static analysis on given apk file
    def __init__(self, output_format=None):
        self._apk_data = dict()
        self._a = None
        self._d = None
        self._dx = None
        self._cm = None
        self._strings = None

        # set output parameters
        categories = ['files',
                      'features',
                      'intent_filters',
                      'activities',
                      'req_permissions',
                      'used_permissions',
                      'api_calls',
                      'crypto_calls',
                      'net_calls',
                      'telephony_calls',
                      'suspicious_calls',
                      'dynamic_calls',
                      'native_calls',
                      'reflection_calls',
                      'urls',
                      'providers',
                      'receivers',
                      'services',
                      'libraries']

        self._out = {'format': output_format,
                     'feat_len': 80,
                     'categories': categories}


    def analyze(self, apk_file):
        """Run the complete static analysis on *apk_file*.

        Data from an earlier run is discarded first; the new results are
        collected in self._apk_data.
        """
        self._apk_data = {}
        self.__init_androguard_objects(apk_file)
        self.__extract_features(apk_file)


    def set_max_output_feat_len(self, feat_len):

        # set maximal length of feature strings
        self._out['feat_len'] = feat_len


    def set_output_categories(self, categories):

        # specify feature categories that should be printed, by default, all extracted features are written to output.
        self._out['categories'] = categories


    def __init_androguard_objects(self, apk_file):
        """Build the analysis objects for *apk_file* and cache them on self.

        Sets self._a (APK), self._d (DEX format object), self._dx (VM
        analysis), self._cm (class manager) and self._strings (DEX strings).
        """
        # The APK wrapper comes first; the DEX object is built from its payload.
        self._a = apk.APK(apk_file)
        dex_payload = self._a.get_dex()
        self._d = dvm.DalvikVMFormat(dex_payload)
        self._dx = VMAnalysis(self._d)

        # Shortcuts obtained from the DEX object.
        dex = self._d
        self._cm = dex.get_class_manager()
        self._strings = dex.get_strings()


    def __extract_features(self, apk_file):
        """Fill self._apk_data with every feature category.

        Hashes and the features read from the APK object are gathered
        first, then the features derived from the VM analysis object
        (self._dx), and finally native calls, URLs and suspicious calls.

        apk_file -- the APK reference handed on to the hash calculation.
        """
        self.__calc_hashes(apk_file)
        self.__extract_apk_obj_features()

        # extract features from vm analysis object
        used_perms_dict = self._dx.get_permissions([])
        self._apk_data['used_permissions'] = used_perms_dict.keys()

        # __extract_dx_features() starts a fresh dict for its category on
        # every call, so calling it once per permission would keep only the
        # calls of the last permission.  Merge the paths of all permissions
        # and extract them in a single call; this also guarantees that the
        # 'api_calls' category exists when no permission is used.
        api_paths = []
        for paths in used_perms_dict.values():
            api_paths.extend(paths)
        self.__extract_dx_features('api_calls', api_paths)

        paths = self._dx.tainted_packages.search_crypto_packages()
        self.__extract_dx_features('crypto_calls', paths)
        paths = self._dx.tainted_packages.search_net_packages()
        self.__extract_dx_features('net_calls', paths)
        paths = self._dx.tainted_packages.search_telephony_packages()
        self.__extract_dx_features('telephony_calls', paths)
        paths = self._dx.get_tainted_packages().search_methods("Ldalvik/system/DexClassLoader;", ".", ".")
        self.__extract_dx_features('dynamic_calls', paths)
        paths = self._dx.get_tainted_packages().search_methods("Ljava/lang/reflect/Method;", ".", ".")
        self.__extract_dx_features('reflection_calls', paths)

        self.__extract_native_calls()
        self.__extract_urls()
        self.__extract_suspicious_calls()


    def __calc_hashes(self, apk_file):
        """Store the 'md5' and 'sha256' results of get_file_hash() for *apk_file*."""
        for algorithm in ('md5', 'sha256'):
            self._apk_data[algorithm] = get_file_hash(algorithm, apk_file)


    def __extract_apk_obj_features(self):
        self._apk_data['apk_name'] = str(basename(self._a.get_filename()))
        self._apk_data['package_name'] = str(self._a.get_package())
        self._apk_data['sdk_version'] = str(self._a.get_min_sdk_version())
        self._apk_data['features'] = self._a.get_elements('uses-feature', 'android:name')
        self._apk_data['files'] = self._a.get_files()
        self._apk_data['activities'] = self._a.get_activities()
        self._apk_data['providers'] = self._a.get_providers()
        self._apk_data['req_permissions'] = self._a.get_permissions()
        self._apk_data['receivers'] = self._a.get_receivers()
        self._apk_data['services'] = self._a.get_services()
        self._apk_data['libraries'] = self._a.get_libraries()
        self._apk_data['intent_filters'] = self._a.get_elements('action', 'android:name') + self._a.get_elements('category', 'android:name')


    def __extract_dx_features(self, category, paths):
        self._apk_data[category] = dict()
        for path in paths:
            class_name = path.get_dst(self._cm)[0]
            method_name = path.get_dst(self._cm)[1]
            if method_name.find('init') > 0:
                method_name = 'init'
            method_name = class_name[1:] + '->' + method_name
            self._apk_data[category][method_name] = 1


    def __extract_native_calls(self):
        self._apk_data['native_calls'] = dict()
        for method in self._d.get_methods():

            # this condition is copied from show_NativeCalls()
            if method.get_access_flags() & 0x100:
                class_name = method.get_class_name()
                method_name = method.get_name()
                if method_name.find('init') > 0:
                    method_name = 'init'
                method_name = class_name[1:] + '->' + method_name
                self._apk_data['native_calls'][method_name] = 1


    def __extract_urls(self):

        # get urls
        ip_regex = '(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})'
        url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|\
                    (?:%[0-9a-fA-F][0-9a-fA-F]))+'

        self._apk_data['urls'] = dict()

        for string in self._strings:
            # search for ip addresses
            ip = re.search(ip_regex, string)
            if None != ip:
                ip = ip.group()
                self._apk_data['urls'][ip] = 1

            # search for urls	
            url = re.search(url_regex, string)
            if None != url:
                url = urllib.quote(url.group(), '>:/?')
                self._apk_data['urls'][url] = 1
                # add hostname
                o = urlparse(url)
                hostname = o.netloc
                self._apk_data['urls'][hostname] = 1


    def __extract_suspicious_calls(self):
        sus_calls = ['Ljava/net/HttpURLconnection;->setRequestMethod',
                     'Ljava/net/HttpURLconnection',
                     'getExternalStorageDirectory',
                     'getSimCountryIso',
                     'execHttpRequest',
                     'sendTextMessage',
                     'Lorg/apache/http/client/methods/HttpPost',
                     'getSubscriberId',
                     'Landroid/telephony/SmsMessage;->getMessageBody',
                     'getDeviceId',
                     'getPackageInfo',
                     'getSystemService',
                     'getWifiState',
                     'system/bin/su',
                     'system/xbin/su',
                     'setWifiEnabled',
                     'setWifiDisabled',
                     'Cipher',
                     'Ljava/io/IOException;->printStackTrace',
                     'android/os/Exec',
                     'Ljava/lang/Runtime;->exec']

        sus_calls = dict(zip(sus_calls, np.ones(len(sus_calls))))
        self._apk_data['suspicious_calls'] = dict()

        for string in self._strings:
            for sc in sus_calls:
                if string.find(sc) >= 0:
                    self._apk_data['suspicious_calls'][string] = 1

        sus_tuples = [('java/net/HttpURLconnection', 'setRequestMethod'),
                      ('android/telephony/SmsMessage', 'getMessageBody'),
                      ('java/io/IOException', 'printStackTrace'),
                      ('java/lang/Runtime', 'exec')]

        for tpl in sus_tuples:
            class_name = tpl[0][1:]
            name = tpl[1]
            paths = self._dx.tainted_packages.search_methods(class_name, name, '')
            for path in paths:
                method = path.get_dst(self._cm)
                method_full = method[0] + '->' + method[1]
                self._apk_data['suspicious_calls'][method_full] = 1


    def __str__(self):
        if self._out['format'] == 'xml':
            out_str = self.__create_xml_string()
        else:
            out_str = self.__get_feature_strings()
        return out_str


    def __get_feature_strings(self):
        feat_str = ''
        for category in self._out['categories']:
            if category not in self._apk_data:
                continue

            for item in self._apk_data[category]:
                feat_str += '\n{0}::{1}'\
                    .format(category, item[:self._out['feat_len']])
        return feat_str[1:]


    def __create_xml_string(self):
        xml_str = '<static>'
        xml_str += self.__get_info_string()
        for category in self._out['categories']:
            xml_str += self.__get_category_string(category)
        xml_str += '\n</static>'

        doc = parseString("" + xml_str + "")
        xml = doc.toxml().replace('<static>', '\n<static>')
        return xml


    def __get_info_string(self):
        istr = '\n\t<info>'
        istr += '\n\t\t<sha256>' + str(self._apk_data['sha256']) + '</sha256>'
        istr += '\n\t\t<md5>' + str(self._apk_data['md5']) + '</md5>'
        istr += '\n\t\t<apk_name>' + self._apk_data['apk_name'] + '</apk_name>'
        istr += '\n\t\t<package_name>' + self._apk_data['package_name'] + '</package_name>'
        istr += '\n\t\t<sdk_version>' + self._apk_data['sdk_version'] + '</sdk_version>'
        istr += '\n\t</info>'
        return istr


    def __get_category_string(self, category):
        cat_str = '\n\t<{}>'.format(category)
        for item in self._apk_data[category]:
            field = self.__get_field_name(category)
            cat_str += '\n\t\t<{0}>{1}</{0}>'\
                .format(field, item[:self._out['feat_len']])
        cat_str += '\n\t</{}>'.format(category)
        return cat_str


    @staticmethod
    def __get_field_name(category):
        if category.endswith('ies'):
            return category[:-3] + 'y'
        else:
            return category[:-1]