Example #1
def test_model(testing_set):
    """ Test the LSTM model."""
    # For reporting current metrics
    freq_s = options.status_interval * 60
    last_s = datetime.now()

    res = [0.0] * len(model.metrics_names)
    batches = 0
    num_samples = len(testing_set)

    for status in map_to_model(testing_set, model.test_on_batch):
        if status is None:
            break
        for stat in range(len(status)):
            res[stat] += status[stat]
        batches += 1
        # Print current metrics at the configured status interval
        if (datetime.now() - last_s).total_seconds() > freq_s:
            c_metrics = [status / batches for status in res]
            c_metrics_str = ', '.join([str(model.metrics_names[x]) + ' ' + '%.12f' % (c_metrics[x]) for x in range(len(c_metrics))])
            c_metrics_str += ', progress %.4f' % (float(generator.fin_tasks.value) / float(num_samples))
            logger.log_info(module_name, 'Status: ' + c_metrics_str)
            last_s = datetime.now()

    if batches < 1:
        logger.log_warning(module_name, 'Testing set did not generate a full batch of data, cannot test')
        return

    for stat in range(len(res)):
        res[stat] /= batches

    logger.log_info(module_name, 'Results: ' + ', '.join([str(model.metrics_names[x]) + ' ' + str(res[x]) for x in range(len(res))]))
Example #2
def train_model(training_set):
    """ Trains the LSTM model."""
    start_time = datetime.now()
    # Checkpointing for saving model weights
    freq_c = options.checkpoint_interval * 60
    last_c = datetime.now()
    # For reporting current metrics
    freq_s = options.status_interval * 60
    last_s = datetime.now()

    res = [0.0] * len(model.metrics_names)
    batches = 0
    for status in map_to_model(training_set, model.train_on_batch):
        if status is None:
            break
        for stat in range(len(status)):
            res[stat] += status[stat]
        batches += 1
        # Print current metrics at the configured status interval
        if (datetime.now() - last_s).total_seconds() > freq_s:
            c_metrics = [status / batches for status in res]
            c_metrics_str = ', '.join([
                str(model.metrics_names[x]) + ' ' + str(c_metrics[x])
                for x in range(len(c_metrics))
            ])
            logger.log_info(MODULE_NAME, 'Status: ' + c_metrics_str)
            last_s = datetime.now()
        # Save current weights at user specified frequency
        if freq_c > 0 and (datetime.now() - last_c).total_seconds() > freq_c:
            logger.log_debug(MODULE_NAME, 'Checkpointing weights')
            try:
                model.save_weights(options.save_weights)
            except:
                generator.stop_generator(10)
                clean_exit(
                    EXIT_RUNTIME_ERROR, "Failed to save LSTM weights:\n" +
                    str(traceback.format_exc()))
            last_c = datetime.now()

    if batches < 1:
        logger.log_warning(
            MODULE_NAME,
            'Training set did not generate a full batch of data, cannot train')
        return

    for stat in range(len(res)):
        res[stat] /= batches

    logger.log_info(
        MODULE_NAME, 'Results: ' + ', '.join([
            str(model.metrics_names[x]) + ' ' + str(res[x])
            for x in range(len(res))
        ]))
    logger.log_debug(
        MODULE_NAME,
        'Training finished in ' + str(datetime.now() - start_time))

    return res[0]  # Average Loss
Example #3
def add_cache(hash, acc, con):
    if not os.path.isdir(CACHE_DIR):
        logger.log_warning(module_name, "Cache directory does not exist, cannot update it")
        return

    ofp = os.path.join(CACHE_DIR, hash)
    data = (acc, con)
    if not os.path.exists(ofp):
        with open(ofp, 'wb') as ofile:
            pickle.dump(data, ofile)
Example #4
def get_cache(hash):
    if not is_cached(hash):
        return None

    ofp = os.path.join(CACHE_DIR, hash)
    with open(ofp, 'rb') as ifile:
        try:
            return pickle.load(ifile)
        except Exception as ex:
            logger.log_warning(module_name, "Failed to access cache: " + str(ex))
            return None
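A minimal round-trip sketch for the two cache helpers above, assuming the same CACHE_DIR, logger, and module_name globals; the key and the two cached values are purely illustrative.

import hashlib

# Hypothetical key; real callers derive it elsewhere (e.g., from a sample's contents).
key = hashlib.sha256(b'sample-bytes').hexdigest()
add_cache(key, 0.97, 0.88)      # store the (acc, con) pair for this key

cached = get_cache(key)         # returns the (acc, con) tuple, or None on a miss/error
if cached is not None:
    acc, con = cached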
Example #5
def load_file(file_path):
    file_suffix = os.path.splitext(file_path)[1].lower()
    if file_suffix == '.json':
        return load_json_file(file_path)
    elif file_suffix in ['.yaml', '.yml']:
        return load_yaml_file(file_path)
    elif file_suffix == ".csv":
        return load_csv_file(file_path)
    else:
        err_msg = u"Unsupported file format: {}".format(file_path)
        logger.log_warning(err_msg)
        return []
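A short usage sketch, assuming load_json_file, load_yaml_file, and load_csv_file are defined in the same module; the paths are placeholders.

records = load_file('settings.yaml')   # dispatches to load_yaml_file by extension
unknown = load_file('settings.ini')    # unsupported format: logs a warning, returns []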
Example #6
def warn_and_debug(has_warned, warning, debug):
    """ Prints a debug message and also generates a generic warning message if one hasn't
        been produced before.

    The point is to record that problems occurred without spamming the warning log level.
    """
    if not has_warned:
        logger.log_warning(module_name, warning)
        has_warned = True

    logger.log_debug(module_name, debug)

    return has_warned
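A sketch of the intended call pattern inside a parsing loop; the records list and validation check are hypothetical, while logger and module_name are the module globals used above.

has_warned = False
records = ['good', '', 'good', None]        # hypothetical parsed records
for record in records:
    if not record:                          # hypothetical validation check
        has_warned = warn_and_debug(
            has_warned,
            'Some records failed validation, see debug level for details',
            'Invalid record: ' + repr(record))
        continue
    # ... process the valid record ...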
Example #7
def lookup_bin(name):
    """ Looks up the path to a bin using Linux environment variables.

    Not as robust as a program like which, but should be good enough.
    """
    logger.log_debug(module_name, 'PATH = ' + str(os.environ['PATH']))
    path_dirs = os.environ['PATH'].split(':')
    for path_dir in path_dirs:
        candidate = os.path.join(path_dir, name)
        if os.path.isfile(candidate):
            return candidate
    logger.log_warning(module_name, 'Failed to find ' + str(name))
    return '' # Failed to find a match
Example #8
def eval_worker_loop(temp_dir, sample):
    o_filename = sample['label'] + '-' + path.basename(
        sample['base_dir']) + '.gz'
    o_filepath = path.join(temp_dir, o_filename)
    logger.log_debug(module_name, 'Writing to ' + o_filepath)
    with gzip.open(o_filepath, 'wt') as ofile:
        if options.preprocess:
            gen_func = reader.read_preprocessed
        else:
            gen_func = reader.disasm_pt_file

        iqueue, oqueue = generator.start_generator(1, gen_func,
                                                   options.queue_size,
                                                   options.seq_len, redis_info)

        if options.preprocess:
            iqueue.put((None, sample['parsed_filepath']))
        else:
            sample_memory = reader.read_memory_file(sample['mapping_filepath'])
            if sample_memory is None:
                logger.log_warning(module_name,
                                   'Failed to parse memory file, skipping')
                generator.stop_generator(10)
                return
            iqueue.put(
                (None, sample['trace_filepath'], bin_dirpath, sample_memory))

        while True:
            try:
                res = oqueue.get(True, 5)
            except:
                in_service = generator.get_in_service()
                if in_service == 0:
                    break
                else:
                    logger.log_debug(
                        module_name,
                        str(in_service) + ' workers still working on jobs')
                    continue

            xs = res[1][1:]
            ys = res[1][0] % options.max_classes

            predict, conf = predict_prob(xs, ys)
            corr = int(predict == ys)
            ofile.write(
                str(corr) + ',' + str(predict) + ',' + str(conf) + ',' +
                str(ys) + "\n")

        generator.stop_generator(10)
Example #9
def map_to_model(samples, f):
    """ A helper function because train_on_batch() and test_on_batch() are so similar."""
    random.shuffle(samples)
    # There's no point spinning up more worker threads than there are samples
    threads = min(options.threads, len(samples))

    if options.preprocess:
        gen_func = reader.read_preprocessed
    else:
        gen_func = reader.disasm_pt_file

    # When you gonna fire it up? When you gonna fire it up?
    iqueue, oqueue = generator.start_generator(threads, gen_func, options.queue_size, options.seq_len,
                                               options.embedding_in_dim, options.max_classes, options.batch_size)

    for sample in samples:
        if options.preprocess:
            iqueue.put((None, sample['parsed_filepath']))
        else:
            sample_memory = reader.read_memory_file(sample['mapping_filepath'])
            if sample_memory is None:
                logger.log_warning(module_name, 'Failed to parse memory file, skipping')
                continue
            iqueue.put((None, sample['trace_filepath'], options.bin_dir, sample_memory, options.timeout))

    # Get parsed sequences and feed them to the LSTM model
    batch_cnt = 0
    while True:
        try:
            res = oqueue.get(True, 5)
        except queue.Empty:
            in_service = generator.get_in_service()
            if in_service == 0:
                break
            else:
                logger.log_debug(module_name, str(in_service) + ' workers still working on jobs')
                continue

        yield f(res[1], res[2])
        batch_cnt += 1

    logger.log_info(module_name, "Processed " + str(batch_cnt) + " batches, " + str(batch_cnt * options.batch_size) + " samples")

    generator.stop_generator(10)
    # End of generator
    while True:
        yield None
Example #10
def insert(seq, dst):
    seq_len = len(seq)
    if seq_len > max_seq:
        logger.log_warning(
            module_name,
            'Tried to insert sequence that exceeds max sequence length')
        return

    src_key = str(seq)

    if src_key in edges[seq_len] and dst in edges[seq_len][src_key]:
        edges[seq_len][src_key][dst] += 1
    elif src_key in edges[seq_len]:
        edges[seq_len][src_key][dst] = 1
    else:
        edges[seq_len][src_key] = dict()
        edges[seq_len][src_key][dst] = 1
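The same counting logic can be written more compactly with collections.defaultdict and Counter; a behavior-equivalent sketch, assuming max_seq, logger, and module_name are the same module globals, with the plain nested dict replaced by the defaultdict structure.

from collections import Counter, defaultdict

# edges[seq_len][src_key] becomes a Counter of destination frequencies.
edges = defaultdict(lambda: defaultdict(Counter))

def insert(seq, dst):
    if len(seq) > max_seq:
        logger.log_warning(
            module_name,
            'Tried to insert sequence that exceeds max sequence length')
        return
    edges[len(seq)][str(seq)][dst] += 1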
Example #11
def kodi_json_request(params):
    data = json.dumps(params)
    request = xbmc.executeJSONRPC(data)

    try:
        response = json.loads(request)
    except UnicodeDecodeError:
        response = json.loads(request.decode('utf-8', 'ignore'))

    try:
        return response['result']
    except KeyError:
        logger.log_warning("[%s] %s" %
                           (params['method'], response['error']['message']))
        return None
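A brief usage sketch for the helper above; Player.GetActivePlayers is a standard Kodi JSON-RPC method, and the handling of the returned payload is illustrative.

players = kodi_json_request({
    'jsonrpc': '2.0',
    'method': 'Player.GetActivePlayers',
    'id': 1,
})
if players:
    # The 'result' payload for this method is a list of active players.
    print('Active player id: %d' % players[0]['playerid'])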
Example #12
def load_sets():
    if not path.isfile(options.input_sets):
        clean_exit(EXIT_INVALID_ARGS, "Cannot find file " + str(options.input_sets))

    set_key = None

    try:
        with open(options.input_sets, 'r') as ifile:
            for line in ifile:
                line = line.rstrip()
                if len(line) < 1:
                    continue
                if line[0] == '[':
                    set_key = line[1:-1]
                else:
                    # Line should be the path to a trace directory
                    if root_dir not in line:
                        logger.log_warning(module_name, 'Input data specified with -i must be in ' + str(root_dir) + ', skipping')
                        continue
                    if not path.isdir(line):
                        logger.log_warning(module_name, 'Cannot find directory ' + str(line) + ' to load data from, skipping')
                        continue
                    matches = [record for record in fs if record['base_dir'] == line]
                    if len(matches) < 1:
                        logger.log_warning(module_name, 'Could not find data in directory ' + str(line) + ', skipping')
                        continue
                    sets_meta[set_key].append(matches[0])
    except:
        clean_exit(EXIT_RUNTIME_ERROR, "Failed to load sets from " + str(options.input_sets))
Example #13
def map_to_model(samples, f):
    """ A helper function because train_on_batch() and test_on_batch() are so similar."""
    global redis_info
    global oqueue

    random.shuffle(samples)
    # There's no point spinning up more worker threads than there are samples
    threads = min(options.threads, len(samples))

    if options.preprocess:
        gen_func = reader.read_preprocessed
    else:
        gen_func = reader.disasm_pt_file

    # When you gonna fire it up? When you gonna fire it up?
    iqueue, oqueue = generator.start_generator(threads, gen_func,
                                               options.queue_size,
                                               options.seq_len, redis_info)

    for sample in samples:
        if options.preprocess:
            iqueue.put((None, sample['parsed_filepath']))
        else:
            sample_memory = reader.read_memory_file(sample['mapping_filepath'])
            if sample_memory is None:
                logger.log_warning(module_name,
                                   'Failed to parse memory file, skipping')
                continue
            iqueue.put(
                (None, sample['trace_filepath'], bin_dirpath, sample_memory))

    ncpu = cpu_count()
    workers = Pool(ncpu)
    res = workers.map(worker_loop, [f] * ncpu)

    generator.stop_generator(10)

    return sum(res) / len(res)
Example #14
def disasm_timeout(proc):
    """ Termiantes ptxed proc and logs a warning message. """
    logger.log_warning(module_name, "Timeout reached, terminating early")
    proc.kill()
    DISASM_TIMEOUT.set()
Example #15
def test_log_warning(caplog):
    """Test correct warning message is logged"""
    msg = 'Test warn'
    logger.log_warning(msg)
    assert msg in caplog.text
    assert 'WARNING' in caplog.text
Example #16
def main():
    # Parse input arguments
    parser = OptionParser(
        usage='Usage: %prog [options] trace_directory bin_directory')
    parser.add_option(
        '-f',
        '--force',
        action='store_true',
        help='If a complete or partial output already exists, overwrite it.')
    parser.add_option(
        '-t',
        '--timeout',
        action='store',
        type='int',
        default=None,
        help='Max seconds to run before quitting (default: infinite).')
    parser.add_option(
        '-p',
        '--no-partial',
        action='store_true',
        help='If timeout is reached, do not save the partially parsed trace.')
    options, args = parser.parse_args()

    if len(args) < 2:
        parser.print_help()
        sys.exit(0)

    data_dir = args[0]
    bin_dir = args[1]

    logger.log_start(logging.INFO)

    # Input validation
    if not os.path.isdir(data_dir):
        logger.log_error(module_name, data_dir + ' is not a directory')
        logger.log_stop()
        sys.exit(1)

    if not os.path.isdir(bin_dir):
        logger.log_error(module_name, bin_dir + ' is not a directory')
        logger.log_stop()
        sys.exit(1)

    if options.timeout is None and options.no_partial:
        logger.log_warning(
            module_name, "Setting --no-partial without --timeout does nothing")

    # Make sure all the expected files are there
    mem_file = None
    trace_file = None

    files = os.listdir(data_dir)
    for file in files:
        if file == 'mapping.txt' or file == 'mapping.txt.gz':
            mem_file = os.path.join(data_dir, file)
        elif file == 'trace_0' or file == 'trace_0.gz':
            trace_file = os.path.join(data_dir, file)

    if mem_file is None:
        logger.log_error(
            module_name,
            'Could not find mapping.txt or mapping.txt.gz in ' + data_dir)
        logger.log_stop()
        sys.exit(1)

    if trace_file is None:
        logger.log_error(module_name,
                         'Could not find trace_0 or trace_0.gz in ' + data_dir)
        logger.log_stop()
        sys.exit(1)

    # Parse the memory file
    mem_map = reader.read_memory_file(mem_file)
    if mem_map is None:
        logger.log_error(module_name, 'Failed to parse memory mapping file')
        logger.log_stop()
        sys.exit(1)

    # We're ready to parse the trace
    o_filepath = os.path.join(data_dir, 'trace_parsed.gz')

    if os.path.isfile(o_filepath) and not options.force:
        logger.log_error(module_name, 'Preprocess file already exists')
        logger.log_stop()
        sys.exit(1)

    if os.path.isfile(o_filepath + '.part') and not options.force:
        logger.log_error(module_name, 'Partial preprocess file already exists')
        logger.log_stop()
        sys.exit(1)

    entries = 0
    with gzip.open(o_filepath + '.part', 'wb') as ofile:
        for instr in reader.disasm_pt_file(trace_file, bin_dir, mem_map,
                                           options.timeout):
            if instr is None:
                break
            ofile.write(pack_instr(instr))
            entries += 1

    if reader.DISASM_TIMEOUT.is_set() and options.no_partial:
        logger.log_info(module_name, "Deleting partial trace")
        os.remove(o_filepath + '.part')
    elif entries > 0:
        os.rename(o_filepath + '.part', o_filepath)
    else:
        logger.log_error(module_name, 'No output produced, empty file')
        os.remove(o_filepath + '.part')

    logger.log_stop()
Example #17
        #if/elif chain to determine the operation############################################################
        # The ops codes are as followed
        # create_route:     		post_users
        # add_activity_route: 		put_activity_by_activity_name
        # get_id_route:     		get_user_by_id
        # get_user_route:     		get_users
        # get_name_route:     		get_user_by_name
        # delete_id_route:    		delete_user_by_id
        # delete_name_route:  		delete_user_by_name
        # delete_activity_route:	delete_activity_by_activity_id

        duplicated_flag = 0  # Used to skip the if/elif chain for a duplicated operation
        for msgID in msgIDList:  # Check for duplicates: if the msg_id is already in the dictionary, skip the if/elif chain and resend the original response
            if msgID == request['msg_id']:
                log_warning("REPLICATED msg_in: " + request['msg_id'])
                qout.write(msgIDList[msgID])
                duplicated_flag = 1
                break

        if duplicated_flag == 1: continue
        if request['method'] is None:
            log_fail("INVALID METHOD ID " + request['msg_id'])
            continue

        #print opnum of the request
        log_back("###### EXPECTED OP# : " + str(nextTopNum))
        log_back("!!!!!! REQUEST OP# : " + str(request['opnum']))

        msg_out = Message()  #creating empty message object for msg_out
        key = request['msg_id']  #assigning msg_id to key
Example #18
def eval_model(eval_set):
    """ Evaluate the LSTM model."""
    random.shuffle(eval_set)
    # There's no point spinning up more worker threads than there are samples
    threads = min(options.threads, len(eval_set))

    if options.eval_dir is None:
        eval_dir = tempfile.mkdtemp(suffix='-lstm-pt')
    else:
        if not path.exists(options.eval_dir):
            mkdir(options.eval_dir)
        eval_dir = options.eval_dir
    logger.log_info(module_name, 'Evaluation results will be written to ' + eval_dir)

    if options.preprocess:
        gen_func = reader.read_preprocessed
    else:
        gen_func = reader.disasm_pt_file

    iqueue, oqueue = generator.start_generator(threads, gen_func, options.queue_size, options.seq_len,
                                               options.embedding_in_dim, options.max_classes, options.batch_size)

    for sample in eval_set:
        o_filename = sample['label'] + '-' + path.basename(sample['base_dir']) + '.gz'
        o_filepath = path.join(eval_dir, o_filename)
        EVAL_WRITE_LOCKS[o_filepath] = threading.Lock()
        if options.preprocess:
            iqueue.put((o_filepath, sample['parsed_filepath']))
        else:
            sample_memory = reader.read_memory_file(sample['mapping_filepath'])
            if sample_memory is None:
                logger.log_warning(module_name, 'Failed to parse memory file, skipping')
                continue
            iqueue.put((o_filepath, sample['trace_filepath'], options.bin_dir, sample_memory, options.timeout))

    # Use threads instead of processes to handle and write the prediction
    # results because I/O and numpy crunching do not require the GIL.
    EVAL_PRED_DONE.clear()
    wqueue = queue.Queue(options.queue_size)
    workers = list()
    for id in range(threads):
        worker = threading.Thread(target=eval_worker, args=(wqueue,))
        worker.daemon = True
        worker.start()
        workers.append(worker)

    while True:
        try:
            res = oqueue.get(True, 5)
        except queue.Empty:
            in_service = generator.get_in_service()
            if in_service == 0:
                break
            else:
                logger.log_debug(module_name, str(in_service) + ' workers still working on jobs')
                continue

        wqueue.put([res, model.predict_on_batch(res[1])])

    EVAL_PRED_DONE.set()
    logger.log_debug(module_name, "Waiting for eval workers to terminate")
    for worker in workers:
        worker.join()
    logger.log_debug(module_name, "All eval workers are done")
    generator.stop_generator(10)
Example #19
def init_cache():
    if not os.path.isdir(CACHE_DIR):
        try:
            os.makedirs(CACHE_DIR)
        except Exception as ex:
            logger.log_warning(module_name, "Failed to create cache directory: " + str(ex))
Example #20
def train_model(training_set):
    """ Trains the LSTM model."""
    start_time = datetime.now()
    # Checkpointing for saving model weights
    freq_c = options.checkpoint_interval * 60
    last_c = datetime.now()
    last_b = 10000  # loss at the last checkpoint; start high so the first checkpoint always saves
    # For reporting current metrics
    freq_s = options.status_interval * 60
    last_s = datetime.now()

    res = [0.0] * len(model.metrics_names)
    batches = 0
    num_samples = len(training_set)
    for status in map_to_model(training_set, model.train_on_batch):
        if status is None:
            break
        for stat in range(len(status)):
            res[stat] += status[stat]
        batches += 1
        # Print current metrics at the configured status interval
        if (datetime.now() - last_s).total_seconds() > freq_s:
            c_metrics = [status / batches for status in res]
            c_metrics_str = ', '.join([str(model.metrics_names[x]) + ' ' + '%.12f' % (c_metrics[x]) for x in range(len(c_metrics))])
            c_metrics_str += ', progress %.4f' % (float(generator.fin_tasks.value) / float(num_samples))
            logger.log_info(module_name, 'Status: ' + c_metrics_str)
            last_s = datetime.now()
        # Save current weights at user specified frequency
        if freq_c > 0 and (datetime.now() - last_c).total_seconds() > freq_c:
            logger.log_debug(module_name, 'Checkpointing weights')
            c_metrics = [status / batches for status in res]
            if not options.checkpoint_best or c_metrics[0] < last_b:
                try:
                    model.save_weights(options.save_weights)
                    if options.multi_gpu is not None:
                        template.save_weights(options.save_weights + '.single')
                except:
                    generator.stop_generator(10)
                    clean_exit(EXIT_RUNTIME_ERROR, "Failed to save LSTM weights:\n" + str(traceback.format_exc()))
            if options.checkpoint_es and c_metrics[0] > last_b:
                logger.log_info(module_name, 'Loss did not improve between checkpoints, early stopping and restoring last weights')
                generator.stop_generator(10)
                try:
                    model.load_weights(options.save_weights)
                except:
                    clean_exit(EXIT_RUNTIME_ERROR, "Failed to load LSTM weights:\n" + str(traceback.format_exc()))
                return
            last_b = c_metrics[0]
            last_c = datetime.now()

    if batches < 1:
        logger.log_warning(module_name, 'Training set did not generate a full batch of data, cannot train')
        return

    for stat in range(len(res)):
        res[stat] /= batches

    logger.log_info(module_name, 'Results: ' + ', '.join([str(model.metrics_names[x]) + ' ' + str(res[x]) for x in range(len(res))]))
    logger.log_debug(module_name, 'Training finished in ' + str(datetime.now() - start_time))

    return res[0] # Average Loss
Example #21
def add_filter(key):
    """ Adds a filter from the available_filters dictionary. """
    if key in available_filters.keys():
        enabled_filters.append(available_filters[key])
    else:
        logger.log_warning(module_name, str(key) + " not in available filters")
Example #22
def disasm_pt_file(trace_path, bin_path, mem_mapping, timeout=None):
    """ Disassembles a PT trace into instructions and yields tuples.

    Each tuple contains the following elements:
        Source BBID -- the BB from which a transfer is happening
        Target BBID -- the BB the transfer ends up in
        Transfer Instruction -- the instruction that causes the transfer (e.g., ret).
        Full Instruction -- An array containing the parts of the full instruction (e.g., ['call', 'ptr', 'eax']).
        Full Instruction Size -- The length of the previously mentioned array.

    Note, the reason why Transfer Instruction and Full Instruction are both in the tuple despite being redundant
    is for backwards compatibility with older versions of the code.

    Keyword arguments:
    trace_path -- The filepath to a raw PT trace (may be gzipped).
    bin_path -- The path to a directory containing binaries for use by the disassembler.
    mem_mapping -- A linear array of tuples in the form (start_address, end_address, source_file).
    timeout -- If not None, the max number of seconds to disasm for. Event DISASM_TIMEOUT is set if timeout is reached.

    Yields:
    The tuples described above until EoF is reached, after which None is yielded.
    """
    global mem_map
    mem_map = mem_mapping

    # Some regular expressions
    re_block = re.compile(r'\[block\]')

    # Input validation
    ptxed_path = utils.lookup_bin('ptxed')
    if ptxed_path == '':
        logger.log_error(module_name, 'ptxed not found, cannot read ' + str(trace_path))
        return

    if not path.isfile(trace_path):
        logger.log_error(module_name, str(trace_path) + " does not exist or is not a file")
        return

    if not path.isdir(bin_path):
        logger.log_error(module_name, str(bin_path) + " does not exist or is not a directory")
        return

    temp_dir = tempfile.mkdtemp()

    # If file is gzipped, it must be decompressed first
    if trace_path[-3:] == '.gz':
        ifilepath = path.join(temp_dir, 'pt_data')
        logger.log_debug(module_name, 'Decompressing ' + str(trace_path) + ' into ' + str(ifilepath))
        start_time = datetime.now()
        with gzip.open(trace_path, 'rb') as cfile:
            with open(ifilepath, 'wb') as ofile:
                ofile.write(cfile.read())
        delta_time = datetime.now() - start_time
        logger.log_debug(module_name, 'Decompressing ' + str(trace_path) + ' completed in ' + str(delta_time))
    else:
        ifilepath = trace_path

    # Use ptxed to generate tuples
    command = [ptxed_path, '--block:show-blocks']
    for mapping in mem_map:
        start_addr = hex(mapping[0])
        filename = path.basename(mapping[2].replace("\\", '/'))
        binpath = path.join(bin_path, filename)
        if not path.isfile(binpath):
            logger.log_warning(module_name, binpath + ' does not exist')
            continue
        command.append('--raw')
        command.append(binpath + ':' + start_addr)
    command.append('--pt')
    command.append(ifilepath)

    logger.log_debug(module_name, 'Running ' + ' '.join(command))
    start_time = datetime.now()
    warning_msg = 'Non-critical problems while disassembling trace, see debug level (-l) for more info'
    has_warned = False
    count = 0
    last_bbid = 0
    last_instr = None

    ptxed = subprocess.Popen(command, stdout=subprocess.PIPE, bufsize=1)
    DISASM_TIMEOUT.clear()
    if timeout is not None:
        watchdog = Timer(timeout, disasm_timeout, args=[ptxed])
        watchdog.start()

    for line in ptxed.stdout:
        line = line.decode()
        if re_block.match(line):
            try:
                head, start, end, instr = line.split(' ', 3)
            except ValueError:
                break  # Can happen if watchdog kills ptxed

            if last_instr is None:
                # The first basic block doesn't have a previous block, skip it
                last_instr = instr
                continue

            # Extract the type from the previous instruction (e.g., ret)
            src_type = last_instr.split(' ')[2:]
            # Convert the target address into a BBID
            dst_bbid = get_bbid(int(start, 16))
            if dst_bbid is not None:
                yield (last_bbid, dst_bbid, src_type[0], src_type, len(src_type))
                last_bbid = dst_bbid
                count += 1
            else:
                has_warned = warn_and_debug(has_warned, warning_msg, 'Cannot find BBID for address ' + hex(int(start, 16)))
            last_instr = instr
            continue

    if timeout is not None:
        watchdog.cancel()

    delta_time = datetime.now() - start_time
    logger.log_info(module_name, 'Generated ' + str(count) + ' entries in ' + str(delta_time))

    # Cleanup temp dir
    shutil.rmtree(temp_dir)

    # End of generator
    while True:
        yield None
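A sketch of consuming this generator, following the tuple layout described in the docstring; the trace path, binary directory, and timeout are placeholders, and mem_map is assumed to come from reader.read_memory_file().

for entry in disasm_pt_file('trace_0.gz', '/path/to/bins', mem_map, timeout=300):
    if entry is None:
        break  # the generator yields None forever once the trace is exhausted
    src_bbid, dst_bbid, xfer_instr, full_instr, full_size = entry
    print('%d -> %d via %s' % (src_bbid, dst_bbid, xfer_instr))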
Example #23
def parse_pt_dir(root):
    """ Parses a directory containing PT traces and meta data. This parser expects the
        following layout:

    root/
        <pdf_hash>/
            info.txt
            mapping.txt[.gz]
            trace_0[.gz]
            [trace_parsed.gz]
            [report.json.gz]
        [...]

    info.txt should contain two lines: the original filename and the ground truth label.
    mapping.txt (or optionally mapping.txt.gz if gzip compression is used) is the output
        of the volatility plugin psscan.
    trace_0 (or optionally trace_0.gz if gzip compression is used) is a raw PT trace.
    trace_parsed.gz is an optional file generated using preprocess.py.
    report.json.gz is an optional Cuckoo report file used by syscall.py.

    Returns:
    An array where each item contains the following information in dictionary form: directory,
        trace filepath, memory mapping filepath, info filepath, original filename, and label.
        Upon error, None is returned.
    """
    if not path.isdir(root):
        logger.log_error(module_name, str(root) + ' is not a directory')
        return None

    res = []
    entries = listdir(root)

    for entry in entries:

        entry_info = {'base_dir': path.join(root, entry)}

        if not path.isdir(entry_info['base_dir']):
            logger.log_debug(module_name, 'Skipping ' + str(entry) + ' because it is not a directory')
            continue

        entry_contents = listdir(entry_info['base_dir'])
        for file in entry_contents:
            if file == 'info.txt':
                entry_info['info_filepath'] = path.join(entry_info['base_dir'], file)
                with open(entry_info['info_filepath'], 'r') as ifile:
                    entry_info['original_filename'] = ifile.readline().strip()
                    entry_info['label'] = ifile.readline().strip()
            elif file == 'mapping.txt' or file == 'mapping.txt.gz':
                entry_info['mapping_filepath'] = path.join(entry_info['base_dir'], file)
            elif file == 'trace_0' or file == 'trace_0.gz':
                entry_info['trace_filepath'] = path.join(entry_info['base_dir'], file)
            elif file == 'trace_parsed.gz':
                entry_info['parsed_filepath'] = path.join(entry_info['base_dir'], file)
            elif file == 'report.json.gz':
                entry_info['cuckoo_report'] = path.join(entry_info['base_dir'], file)

        if len(entry_info.keys()) < 6:
            logger.log_warning(module_name, 'Could not find all the necessary files in ' + str(entry_info['base_dir']) + ', skipping')
            logger.log_debug(module_name, 'Found keys: ' + str(entry_info.keys()))
        else:
            logger.log_debug(module_name, 'Adding entry with keys: ' + str(entry_info.keys()))
            res.append(entry_info)

    return res
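A brief sketch of walking the returned records, assuming a directory laid out as described in the docstring; the root path is a placeholder.

records = parse_pt_dir('/data/pt-traces')   # hypothetical root directory
if records is not None:
    for rec in records:
        print(rec['label'], rec['original_filename'], rec['trace_filepath'])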
Example #24
        errors = True

    if options.multi_gpu is not None and options.multi_gpu < 2:
        logger.log_error(module_name, 'Value for multi-GPU mode option must be at least 2')
        errors = True

    if not options.preprocess and options.bin_dir is None:
        logger.log_error(module_name, 'Preprocessing (-p) is not set, so (--bin-dir) is required')
        errors = True

    if options.bin_dir is not None and not path.isdir(options.bin_dir):
        logger.log_error(module_name, 'Binary directory (--bin-dir) must be a directory')
        errors = True

    if options.checkpoint_interval == 0 and (options.checkpoint_best or options.checkpoint_es):
        logger.log_warning(module_name, 'Setting --checkpoint-best or --checkpoint-early-stop without --checkpoint does nothing')

    if options.learn_ret:
        filters.add_filter('ret')

    if options.learn_call:
        filters.add_filter('call')

    if options.learn_icall:
        filters.add_filter('icall')

    if options.learn_jmp:
        filters.add_filter('jmp')

    if options.learn_ijmp:
        filters.add_filter('ijmp')