Example #1
def dump_indexed_strings_freq_distribution():
    try:
        # register signal handler
        signal = utils.Signal()
        signal.install([utils.Signal.SIGINT, utils.Signal.SIGTERM])

        # iterate over all keys
        # for key in redis.keys():
        for key in redis.scan_iter():

            # check for interruption
            if signal.caught():
                logger.error("Interrupted")
                return None

            # zset datatype for hashed strings
            if redis.type(key) == "zset":
                logger.info("%s,%d,", binascii.hexlify(key), redis.zcard(key))

    except Exception as e:
        logger.error("Error dumping freq distribution of indexed strings %s",
                     str(e))
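All eight examples install the same utils.Signal helper before iterating; its implementation is not part of this listing. Below is a minimal sketch of what such a cooperative catcher might look like, with the class name and method signatures inferred from the call sites above (an assumption, not the project's actual code):

# Hypothetical sketch of the utils.Signal helper every example installs.
import signal as _signal

class Signal(object):
    """Cooperative handler: record the signal, let the loop exit cleanly."""
    SIGINT = _signal.SIGINT
    SIGTERM = _signal.SIGTERM

    def __init__(self):
        self._caught = False

    def install(self, signals):
        # replace the default handlers so Ctrl-C/kill set a flag instead
        # of raising KeyboardInterrupt in the middle of a redis iteration
        for sig in signals:
            _signal.signal(sig, self._handler)

    def _handler(self, signum, frame):
        self._caught = True

    def caught(self):
        return self._caught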
Example #2
def dump_repo_signatures():
    try:
        # register signal handler
        signal = utils.Signal()
        signal.install([utils.Signal.SIGINT, utils.Signal.SIGTERM])

        # iterate over all keys
        for key in redis.keys():

            # check for interruption
            if signal.caught():
                logger.error("Interrupted")
                return None

            # zset datatype for hashed strings
            if redis.type(key) == "zset":
                # the zset members are the repo ids recorded for this string;
                # GET would raise WRONGTYPE on a zset key
                logger.info("%s,%s,", binascii.hexlify(key),
                            redis.zrange(key, 0, -1))
            # "hash" is the redis type name ("hset" is the write command)
            elif redis.type(key) == "hash":
                logger.info("%s -> %s,", key, redis.hgetall(key))

    except Exception as e:
        logger.error("Error dumping indexed strings %s", str(e))
Example #3
def dump_indexed_strings():
    try:
        # register signal handler
        signal = utils.Signal()
        signal.install([utils.Signal.SIGINT, utils.Signal.SIGTERM])

        # iterate over all keys
        mapping = {}
        revmapping = {}
        roots = []
        for key in redis.keys():

            # check for interruption
            if signal.caught():
                logger.error("Interrupted")
                return None

            # zset datatype for hashed strings
            if redis.type(key) == "zset":
                values = redis.zrange(key, 0, -1, withscores=False)
                # hexlify only for logging; zrange needs the raw key
                logger.info("%s,%s,", binascii.hexlify(key), values)

            elif redis.type(key) == "set":
                values = redis.smembers(key)
                logger.info("%s -> %s", key, values)

            elif redis.type(key) == "hash" and '_' in key:
                values = redis.hgetall(key)
                logger.info("%s -> %s", key, values)

            elif (redis.type(key) == "hash" and '-' in key
                  and key.split('-', 1)[0]
                  in ['str', 'func', 'file', 'dir', 'branch', 'repo']):
                values = redis.hgetall(key)
                logger.info("%s -> %s", key, values)
                from common import skip_set
                for h, c in values.items():
                    # skip special purpose item
                    if h in skip_set or h == key:
                        continue
                    if len(str(h)) < 10:
                        roots.append(h)
                    if key not in mapping:
                        mapping[key] = []
                    mapping[key].append(h)
                    if h not in revmapping:
                        revmapping[h] = []
                    if key not in revmapping[h]:
                        revmapping[h].append(key)

            elif redis.type(key) != "hash" and len(str(key)) < 15:
                values = redis.get(key)
                logger.info("%s -> %s", key, values)

            # else:
            #    logger.info("%s type %s", key, redis.type(key))

        if revmapping and mapping:
            for root in roots:
                dump_tree(root, 0, mapping, revmapping)

    except Exception as e:
        logger.error("Error dumping indexed strings %s", str(e))
Example #4
def dump_indexed_unique_strings_repo_distribution():
    try:
        import matplotlib.pyplot as plt
        import numpy as np

        # register signal handler
        signal = utils.Signal()
        signal.install([utils.Signal.SIGINT, utils.Signal.SIGTERM])

        repo_all_strs = dict()
        repo_unq_strs = dict()

        # iterate over all keys
        for key in redis.keys():

            # check for interruption
            if signal.caught():
                logger.error("Interrupted")
                return None

            # zset datatype for hashed strings
            if redis.type(key) == "zset":
                for repo_id in redis.zrange(key, 0, -1, withscores=False):
                    if repo_id in repo_all_strs:
                        repo_all_strs[repo_id] += 1
                    else:
                        repo_all_strs[repo_id] = 1

                    if redis.zcard(key) == 1:
                        if repo_id in repo_unq_strs:
                            repo_unq_strs[repo_id] += 1
                        else:
                            repo_unq_strs[repo_id] = 1

        # format: repo_id, num_strs, num_uniq_strs, ratio_uniq_all_strs
        for repo_id, count in repo_all_strs.iteritems():
            try:
                ratio = float(repo_unq_strs[repo_id]) / count
                logger.info("%s,%d,%d,%0.2f", repo_id, count, \
                            repo_unq_strs[repo_id], ratio)
                repo_all_strs[repo_id] = ratio

            except Exception as e:
                logger.error("%s,%s", repo_id, str(e))

        plt.hist(repo_all_strs.values(), bins=50)  # or log-spaced bins, see below
        plt.gca().set_xscale('log')
        plt.title("Unique/total strings across all repos")
        plt.xlabel('# Strings')
        plt.ylabel('# Repos')
        # pyplot.grid(True)
        plt.legend()
        plt.savefig('strings_hist', format='pdf')

    except ImportError as ie:
        logger.error("Error importing required modules: %s", str(ie))

    except Exception as e:
        logger.error("Error dumping stats of indexed strings per repo %s",
                     str(e))
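A note on the binning that was commented out above: np.logspace takes exponents, not endpoints, so np.logspace(1, 1000000, 100) would span 10**1 to 10**1000000. For counts between ten and a million the intended edges were presumably:

import numpy as np

# 100 log-spaced bin edges from 10**1 to 10**6, matching the log x-axis
bins = np.logspace(1, 6, 100)

Since repo_all_strs is rewritten in place to hold ratios in [0, 1], edges such as np.logspace(-3, 0, 50) would fit the plotted values better.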
Example #5
def run_counter(main, argv):
    global logger, stats_logger
    logger = main.logger
    stats_logger = main.stats_logger

    if len(argv) != 1:
        logger.error('expects args: $feature_csv_list, but got: %s', argv)
        exit(1)

    input_path = argv[0]
    if not os.path.exists(input_path):
        logger.error("%s does not exist", input_path)
        exit(1)

    redis_rrc = main.rrc
    if not redis_rrc or not redis_rrc.handle():
        logger.error("redis rrc not available, exiting!")
        exit(1)

    input_list = get_input_list(main=main,
                                input_list_file=input_path,
                                skip_input_callback=skip_input)
    # deduplicate!
    input_list = list(set(input_list))

    # start extracting
    if input_list:
        # track progress
        count = len(input_list)
        logger.info("Counting %d feature files", count)

        # register signal handler
        signal = utils.Signal()
        signal.install([utils.Signal.SIGINT, utils.Signal.SIGTERM])

        pb = utils.Progressbar("Counting features: ", count)
        pb.start()

        if main.QUEUING and main.QUEUING == "Celery":
            from celery import group
            from celery_tasks import feature_counter_worker

            # group jobs
            job = group(
                feature_counter_worker.s(infile) for infile in input_list)
            result = job.apply_async()

            # track worker progress
            completed = 0
            while result.waiting():
                completed = result.completed_count()
                if completed < count:
                    pb.update(completed)
                time.sleep(2)

        else:  # non-parallel instance
            count = 0
            # scan loop
            for infile in input_list:
                # check for interruption
                if signal.caught():
                    break
                if count_features(main, infile):
                    count += 1
                # update progressbar
                pb.update(count)

            if not signal.caught():
                pb.finish()
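Note that the Celery branch above never consults the signal handler it installed; only the sequential branch does. Below is a sketch of an interrupt-aware polling loop, reusing the names from the example above and Celery's GroupResult.revoke() to cancel the outstanding group:

completed = 0
while result.waiting():
    if signal.caught():
        result.revoke(terminate=True)  # drop pending/running tasks
        break
    completed = result.completed_count()  # running total, so assign
    if completed < count:
        pb.update(completed)
    time.sleep(2)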
Example #6
def run_searcher(main, argv):
    global logger, stats_logger
    logger = main.logger
    stats_logger = main.stats_logger
    searching.logger = main.logger
    searching.stats_logger = main.stats_logger
    searching_java.logger = main.logger
    searching_java.stats_logger = main.stats_logger

    if len(argv) != 2:
        logger.error('expects two args')
        exit(1)

    # if we are just testing this repo
    if argv[0] == 'dump':
        main.TEST_REPO = True

    # check if redis is populated
    ndbsize, ndbval = main.nrc.dbsize()
    jdbsize, jdbval = main.jrc.dbsize()
    rdbsize, rdbval = main.rrc.dbsize()
    if ndbsize == 0 or jdbsize == 0:
        logger.error("Nothing is indexed in native or java redis db (ndbsize: %s, jdbsize: %s, rdbsize: %s)! Exiting.",
                     ndbsize, jdbsize, rdbsize)
        exit(1)

    # check if path exists
    input_path = argv[1]
    if not os.path.exists(input_path):
        logger.error('%s does not exist', input_path)
        exit(1)

    apk_list = get_input_list(main=main, redis=main.rrc, redis_pipe=main.rrc.pipeline(), input_path=input_path,
                              input_type="apk", skip_scanned=True, skip_failure=True)

    print ("There are %d input to be searched" % len(apk_list))
    # start searching
    if apk_list:

        # register signal handler
        signal = utils.Signal()
        signal.install([utils.Signal.SIGINT, utils.Signal.SIGTERM])

        # track progress
        count = len(apk_list)
        logger.info("Searching %d applications", count)
        pb = utils.Progressbar('Matching libs: ', count)
        pb.start()

        # if requested parallelism
        if main.QUEUING and main.QUEUING == "Celery":
            from celery import group
            from celery_tasks import search_apk_worker

            # group jobs
            job = group(search_apk_worker.s(app_path) for app_path in apk_list)
            result = job.apply_async()

            # track worker progress
            completed = 0
            while result.waiting():
                time.sleep(5)
                completed = result.completed_count()
                if completed < count:
                    pb.update(completed)

            # all done
            pb.finish()
            result.get()

        else:  # non-parallel instance

            # search loop
            count = 0
            for app_path in apk_list:

                # check for interruption
                if signal.caught():
                    break

                # lookup apk
                search_apk(main, app_path)

                # update progressbar
                count += 1
                pb.update(count)

            # all done
            if not signal.caught() and pb:
                pb.finish()

    else:
        logger.error("No apk(s) to search")
Example #7
def run_signature(main, argv):
    global logger, stats_logger
    logger = main.logger
    stats_logger = main.stats_logger

    # the outer args
    if len(argv) != 2:
        logger.error('expects two args')
        exit(1)
    if argv[0] == 'dump':
        main.TEST_REPO = True

    # the inner args
    argv = argv[1]
    if len(argv) < 1 or len(argv) > 2:
        logger.error('expects args: $input_path [$input_type] [-d]')
        exit(1)

    input_path = argv[0]
    input_type = argv[1] if len(argv) == 2 else 'jar'
    if not os.path.exists(input_path):
        logger.error('%s does not exist', input_path)
        exit(1)

    input_list = get_input_list(main=main,
                                redis=None,
                                redis_pipe=None,
                                input_path=input_path,
                                input_type=input_type,
                                path_as_id=True,
                                skip_scanned=False,
                                skip_signatured=True,
                                skip_failure=True)
    print("There are %d input to be signatured" % len(input_list))
    # start signature
    # query the database
    if input_list:
        # register signal handler
        signal = utils.Signal()
        signal.install([utils.Signal.SIGINT, utils.Signal.SIGTERM])

        # track progress
        count = len(input_list)
        logger.info("Matching %d libraries/applications", count)

        # if requested parallelism
        if main.QUEUING and main.QUEUING == 'Celery':
            from celery import group
            from celery_tasks import signature_java_worker

            # group jobs
            input_count = len(input_list)
            for index in range(0, input_count, JOB_CHUNK):
                tmp_input_list = input_list[index:min(index +
                                                      JOB_CHUNK, input_count)]
                logger.info("Processing batch %d: %d inputs",
                            index / JOB_CHUNK + 1,
                            min(JOB_CHUNK, input_count - index))
                job = group(
                    signature_java_worker.s(item, input_type)
                    for item in tmp_input_list)
                result = job.apply_async()
                try:
                    result.get()
                except Exception as e:
                    logger.error("Error signaturing jobs: %s", str(e))

        else:  # non-parallel instance
            pb = utils.Progressbar('Matching libs/apps: ', count)
            pb.start()

            count = 0
            for item in input_list:

                # check for interruption
                if signal.caught():
                    break

                if main.TEST_REPO:
                    pb.msg('Testing {0} '.format(item))
                else:
                    pb.msg('Signaturing {0} '.format(item))

                # signature libs/apps
                signature_classes(main=main,
                                  input_path=item,
                                  input_type=input_type)

                # update progressbar
                count += 1
                pb.update(count)

            # all done
            if not signal.caught() and pb:
                pb.finish()

    else:
        logger.error("No lib(s) to signature")
Example #8
def run_validator(main, argv):
    global logger, stats_logger
    logger = main.logger
    stats_logger = main.stats_logger

    if len(argv) != 1:
        logger.error('expects args: $apks_to_validate, but got: %s', argv)
        exit(1)

    input_path = argv[0]
    if not os.path.exists(input_path):
        logger.error("%s does not exist", input_path)
        exit(1)

    input_list = get_input_list(main=main,
                                redis=main.rrc.handle(),
                                redis_pipe=main.rrc.pipeline(),
                                input_path=input_path,
                                path_as_id=True,
                                skip_scanned=main.ignore_scanned)
    # deduplicate!
    input_list = list(set(input_list))

    # start validating
    if input_list:
        # track progress
        count = len(input_list)
        logger.info("Validating %d applications", count)

        # register signal handler
        signal = utils.Signal()
        signal.install([utils.Signal.SIGINT, utils.Signal.SIGTERM])

        pb = utils.Progressbar('Validating applications: ', count)
        pb.start()

        if main.QUEUING and main.QUEUING == "Celery":
            from celery import group
            from celery_tasks import validate_worker

            # group jobs
            job = group(validate_worker.s(app_path) for app_path in input_list)
            result = job.apply_async()

            # track worker progress
            completed = 0
            while result.waiting():
                completed = result.completed_count()
                if completed < count:
                    pb.update(completed)
                time.sleep(2)

        else:  # non-parallel instance

            count = 0

            # scan loop
            for app_path in input_list:
                # check for interruption
                if signal.caught():
                    break

                if validate_apk(main=main, app_path=app_path):
                    count += 1

                # update progressbar
                pb.update(count)

            if not signal.caught():
                pb.finish()
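One last detail: Examples #5 and #8 deduplicate with list(set(...)), which also discards the input order. When order matters, an order-preserving variant is:

from collections import OrderedDict

# order-preserving deduplication; list(set(...)) would reorder the items
input_list = list(OrderedDict.fromkeys(input_list))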