Example #1
def vectorize_subset(subset):
    # Locate and validate the raw feature files for the requested subset
    data_dir = os.path.join(os.getcwd(), 'ember')
    if subset == 'train':
        paths = [
            os.path.join(data_dir, "train_features_{}.jsonl".format(i))
            for i in range(6)
        ]
        n_rows = 900000
    elif subset == 'test':
        paths = [
            os.path.join(data_dir, "test_features.jsonl"),
        ]
        n_rows = 200000
    else:
        logging.error('subset must be "train" or "test"')
        sys.exit(1)
    for p in paths:
        if not os.path.exists(p):
            logging.error('File not found: {}'.format(p))
            sys.exit(1)
    X_path = os.path.join(data_dir, "X_{}.dat".format(subset))
    y_path = os.path.join(data_dir, "y_{}.dat".format(subset))

    if os.path.exists(X_path + '.shd256') and os.path.exists(y_path +
                                                             '.shd256'):
        with open(X_path + '.shd256', 'r') as f:
            X_checksum = f.read()
        with open(y_path + '.shd256', 'r') as f:
            y_checksum = f.read()
        if X_checksum == sha256_checksum(
                X_path) and y_checksum == sha256_checksum(y_path):
            logging.info('"{}" subset is vectorized'.format(subset))
            return

    # Allocate storage space
    dim = FeatureExtractor.dim
    X = np.memmap(X_path, dtype=np.float32, mode="w+", shape=(n_rows, dim))
    y = np.memmap(y_path, dtype=np.float32, mode="w+", shape=n_rows)
    del X, y

    logging.info('Vectorizing samples in "{}" subset'.format(subset))
    pool = mp.Pool()
    arg_iterator = (
        (row, raw_data, X_path, y_path, n_rows)
        for row, raw_data in enumerate(raw_feature_iterator(paths)))
    for _ in tqdm(pool.imap_unordered(vectorize_data, arg_iterator),
                  unit='row',
                  unit_scale=True,
                  ncols=96,
                  miniters=1,
                  total=n_rows):
        pass
    pool.close()
    pool.join()

    X_checksum = sha256_checksum(X_path)
    with open(X_path + '.shd256', 'w') as f:
        f.write(X_checksum)
    y_checksum = sha256_checksum(y_path)
    with open(y_path + '.shd256', 'w') as f:
        f.write(y_checksum)
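
Example #1 passes each JSONL line to a vectorize_data worker that is not shown in this listing. A minimal sketch of such a worker is given below; the FeatureExtractor API (process_raw_features) and the "label" field are assumptions, not taken from the source.

# Hypothetical vectorize_data worker assumed by Example #1: reopen the
# pre-allocated memmaps in "r+" mode and write one row per JSONL record.
import json

import numpy as np


def vectorize_data(args):
    row, raw_data, X_path, y_path, n_rows = args
    raw_features = json.loads(raw_data)                    # one JSON object per line
    extractor = FeatureExtractor()                         # assumed extractor class
    feature_vector = extractor.process_raw_features(raw_features)  # assumed API

    X = np.memmap(X_path, dtype=np.float32, mode="r+",
                  shape=(n_rows, FeatureExtractor.dim))
    X[row] = feature_vector

    y = np.memmap(y_path, dtype=np.float32, mode="r+", shape=n_rows)
    y[row] = raw_features.get("label", -1)                 # -1 marks unlabeled rows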
Example #2
def scan(file_path, verbose, table, proxy, callback, eshost, timeout, dump,
         output):
    """Malice PDF Plugin."""

    try:
        # set up logging
        init_logging(verbose)

        # TODO: check if PDF is too big (max size 3000000 ??)
        # TODO: if PDFiD fails maybe build different response JSON with errors etc.
        pdfid_results = MalPDFiD(file_path).run()
        pdf_results = {
            'pdfid': pdfid_results,
            'streams': MalPdfParser(file_path,
                                    pdfid_results,
                                    should_dump=dump,
                                    dump_path=output,
                                    verbose=verbose).run(),
        }
        # pdf_dict['pdf']['peepdf'] = MalPeepdf(file_path).run()
        malice_scan = {
            'id': os.environ.get('MALICE_SCANID', sha256_checksum(file_path)),
            'name': 'pdf',
            'category': 'document',
            'results': pdf_results
        }
        malice_scan['results']['markdown'] = json2markdown(pdf_results)

        # write to elasticsearch
        if eshost:
            try:
                e = Elastic(eshost, timeout=timeout)
                e.write(results=malice_scan)
            except Exception as e:
                log.exception(
                    "failed to index malice/pdf results into elasticsearch")

        if table:
            print(malice_scan['results']['markdown'])
        else:
            print(json.dumps(pdf_results, indent=True))

        # POST dropped files as a JSON blob back to malice server/daemon
        if callback:
            proxies = None
            if proxy:
                proxies = {
                    'http': proxy,
                    'https': proxy,
                }
            malice_scan['parent'] = os.environ.get('MALICE_SCANID',
                                                   sha256_checksum(file_path))
            requests.post(callback, json=malice_scan, proxies=proxies)

    except Exception as e:
        log.exception("failed to run malice plugin: pdf")
        return
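
Nearly every snippet on this page calls a sha256_checksum helper defined elsewhere in the respective projects. A minimal sketch of such a helper, streaming the file in fixed-size blocks so large samples never need to fit in memory:

# Hypothetical sha256_checksum helper assumed by the examples on this page.
import hashlib


def sha256_checksum(file_path, block_size=65536):
    sha256 = hashlib.sha256()
    with open(file_path, 'rb') as f:
        for block in iter(lambda: f.read(block_size), b''):
            sha256.update(block)
    return sha256.hexdigest()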
Example #3
def scan(file_path, verbose, table, proxy, callback, eshost, timeout, dump,
         output, peid):
    """Malice PExecutable Scanner"""

    try:
        # set up logging
        init_logging(verbose)

        # TODO: check if EXE is too big (max size 3000000 ??)
        pe_results = MalPEFile(file_path,
                               peid_db_path=peid,
                               should_dump=dump,
                               dump_path=output).run()

        malice_scan = {
            'id': os.environ.get('MALICE_SCANID', sha256_checksum(file_path)),
            'name': 'pescan',
            'category': 'exe',
            'results': pe_results
        }
        try:
            malice_scan['results']['markdown'] = json2markdown(pe_results)
        except Exception as e:
            log.exception("failed to render jinja template")
            malice_scan['results']['markdown'] = str(e)

        # write to elasticsearch
        if eshost:
            try:
                e = Elastic(eshost, timeout=timeout)
                e.write(results=malice_scan)
            except Exception as e:
                log.exception(
                    "failed to index malice/pescan results into elasticsearch")

        if table:
            print(malice_scan['results']['markdown'])
        else:
            pe_results.pop('markdown')
            print(json.dumps(pe_results, indent=True))

        # POST dropped files as a JSON blob back to malice server/daemon
        if callback:
            proxies = None
            if proxy:
                proxies = {
                    'http': proxy,
                    'https': proxy,
                }
            malice_scan['parent'] = os.environ.get('MALICE_SCANID',
                                                   sha256_checksum(file_path))
            requests.post(callback, json=malice_scan, proxies=proxies)

    except Exception as e:
        log.exception("failed to run malice plugin: pescan")
        return
Example #4
def scan(file_path, verbose, table, proxy, callback, eshost, timeout, dump, output, peid):
    """Malice PExecutable Scanner"""

    try:
        # set up logging
        init_logging(verbose)

        # TODO: check if EXE is too big (max size 3000000 ??)
        pe_results = MalPEFile(file_path, peid_db_path=peid, should_dump=dump, dump_path=output).run()

        malice_scan = {
            'id': os.environ.get('MALICE_SCANID', sha256_checksum(file_path)),
            'name': 'pescan',
            'category': 'exe',
            'results': pe_results
        }
        try:
            malice_scan['results']['markdown'] = json2markdown(pe_results)
        except Exception as e:
            log.exception("failed to render jinja template")
            malice_scan['results']['markdown'] = str(e)

        # write to elasticsearch
        if eshost:
            try:
                e = Elastic(eshost, timeout=timeout)
                e.write(results=malice_scan)
            except Exception as e:
                log.exception("failed to index malice/pescan results into elasticsearch")

        if table:
            print(malice_scan['results']['markdown'])
        else:
            pe_results.pop('markdown')
            print(json.dumps(pe_results, indent=True))

        # POST dropped files as a JSON blob back to malice server/daemon
        if callback:
            proxies = None
            if proxy:
                proxies = {
                    'http': proxy,
                    'https': proxy,
                }
            malice_scan['parent'] = os.environ.get('MALICE_SCANID', sha256_checksum(file_path))
            requests.post(callback, json=malice_scan, proxies=proxies)

    except Exception as e:
        log.exception("failed to run malice plugin: pescan")
        return
Example #5
def scan(file_path, verbose, table, proxy, callback, eshost, timeout, dump,
         output):
    """Malice Office/OLE/RTF Scanner"""

    try:
        # set up logging
        init_logging(verbose)

        # TODO: check if DOC is too big (max size 3000000 ??)
        o_results = MalOffice(file_path, should_dump=dump,
                              dump_path=output).run()

        malice_scan = {
            'id': os.environ.get('MALICE_SCANID', sha256_checksum(file_path)),
            'name': 'office',
            'category': 'document',
            'results': o_results
        }
        malice_scan['results']['markdown'] = json2markdown(o_results)

        # write to elasticsearch
        if eshost:
            try:
                e = Elastic(eshost, timeout=timeout)
                e.write(results=malice_scan)
            except Exception as e:
                log.exception(
                    "failed to index malice/office results into elasticsearch")

        if table:
            print(malice_scan['results']['markdown'])
        else:
            o_results.pop('markdown')
            print(json.dumps(o_results, indent=True))

        # POST dropped files as a JSON blob back to malice server/daemon
        if callback:
            proxies = None
            if proxy:
                proxies = {
                    'http': proxy,
                    'https': proxy,
                }
            malice_scan['parent'] = os.environ.get('MALICE_SCANID',
                                                   sha256_checksum(file_path))
            requests.post(callback, json=malice_scan, proxies=proxies)

    except Exception as e:
        log.exception("failed to run malice plugin: office")
        return
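
Examples #2 through #5 share the same shape: a scan() entry point whose parameters (file path, verbosity, output options, Elasticsearch host, callback URL) suggest a command-line plugin. A hypothetical wiring with click for the nine-parameter variant is sketched below; every option name and default is illustrative, not taken from the source.

# Hypothetical click wiring for the scan() functions above (an assumption:
# the source may expose them differently). Names and defaults are illustrative.
import click


@click.command()
@click.argument('file_path', type=click.Path(exists=True))
@click.option('--verbose', '-v', is_flag=True, help='enable verbose logging')
@click.option('--table', '-t', is_flag=True, help='print the markdown table instead of JSON')
@click.option('--proxy', default=None, help='proxy URL used for the callback POST')
@click.option('--callback', default=None, help='malice server/daemon callback URL')
@click.option('--eshost', default=None, help='Elasticsearch host to index results into')
@click.option('--timeout', default=60, type=int, help='Elasticsearch timeout in seconds')
@click.option('--dump', is_flag=True, help='dump extracted/dropped files')
@click.option('--output', default=None, help='directory that receives dumped files')
def cli(file_path, verbose, table, proxy, callback, eshost, timeout, dump, output):
    """Run the scanner from the command line."""
    scan(file_path, verbose, table, proxy, callback, eshost, timeout, dump, output)


if __name__ == '__main__':
    cli()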
Example #6
    def addToIndex(cls, **kwargs):
        image_hash = util.sha256_checksum(kwargs['filename'])
        picture = session.query(cls).filter_by(file_hash=image_hash)
        if picture.scalar():
            util.logger.debug(
                'Attempted to add existing image to index, skipping...')
            return False
        image = ColorThief(kwargs['filename'])
        dominant_color = image.get_color(quality=6)
        cls.create(
            filename=kwargs['filename'],
            file_hash=image_hash,  # reuse the checksum computed above
            color=json.dumps(dominant_color),
            ts_indexed=datetime.datetime.now())
        session.commit()
Example #7
    def __calc_doubt_dest__(self, factor):
        if self.doubts is None:
            return None
        src_noext, _ext = os.path.splitext(factor.src)

        file_hash = sha256_checksum(factor.src)
        bn = file_hash + _ext

        dst = os.path.join(self.doubts, bn)
        return dst
Example #8
def save_numpy_file(subset):
    data_dir = os.path.join(os.getcwd(), 'ember')
    if subset == 'train':
        n_rows = 900000
    elif subset == 'test':
        n_rows = 200000
    else:
        logging.error('subset must be "train" or "test"')
        sys.exit(1)

    X_npy = os.path.join(data_dir, "X_{}.npy".format(subset))
    y_npy = os.path.join(data_dir, "y_{}.npy".format(subset))

    if os.path.exists(X_npy + '.shd256') and os.path.exists(y_npy + '.shd256'):
        with open(X_npy + '.shd256', 'r') as f:
            X_checksum = f.read()
        with open(y_npy + '.shd256', 'r') as f:
            y_checksum = f.read()
        if X_checksum == sha256_checksum(
                X_npy) and y_checksum == sha256_checksum(y_npy):
            logging.info(
                'Numpy files for "{}" subset already exist'.format(subset))
            return

    logging.info(
        'Saving numpy files for labeled samples in "{}" subset'.format(subset))
    dim = FeatureExtractor.dim
    X_dat = os.path.join(data_dir, "X_{}.dat".format(subset))
    y_dat = os.path.join(data_dir, "y_{}.dat".format(subset))
    X = np.memmap(X_dat, dtype=np.float32, mode="r", shape=(n_rows, dim))
    y = np.memmap(y_dat, dtype=np.float32, mode="r", shape=n_rows)
    labeled_rows = (y != -1)
    np.save(X_npy, X[labeled_rows])
    np.save(y_npy, y[labeled_rows])

    X_checksum = sha256_checksum(X_npy)
    with open(X_npy + '.shd256', 'w') as f:
        f.write(X_checksum)
    y_checksum = sha256_checksum(y_npy)
    with open(y_npy + '.shd256', 'w') as f:
        f.write(y_checksum)
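
Once Example #8 has written the .npy files, the labeled arrays can be loaded directly. A short follow-up sketch (loading only; any training step is omitted):

# Hypothetical follow-up to Example #8: load the labeled arrays it produced.
import os

import numpy as np

data_dir = os.path.join(os.getcwd(), 'ember')
X_train = np.load(os.path.join(data_dir, 'X_train.npy'))
y_train = np.load(os.path.join(data_dir, 'y_train.npy'))
print(X_train.shape, y_train.shape)  # only rows whose label was not -1 are kept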
Example #9
    def __determine_dest__(self, src, pref_dest):
        src_dir = path.dirname(src)
        src_bn = path.basename(src)
        src_bn_noext, src_ext = path.splitext(src_bn)

        dst_dir = path.dirname(pref_dest)
        dst_bn = path.basename(pref_dest)
        dst_bn_noext, dst_ext = path.splitext(dst_bn)

        op_ctx = OpContext(src)

        if path.exists(pref_dest):
            if self.hash_check:
                src_hash = sha256_checksum(src)
                siblings = []
                for sibling in os.listdir(dst_dir):
                    if sibling.startswith(dst_bn_noext) and sibling.endswith(
                            dst_ext):
                        siblings.append(sibling)
                        sibling_hash = sha256_checksum(
                            path.join(dst_dir, sibling))
                        if sibling_hash == src_hash:
                            op_ctx.duplication = path.join(dst_dir, sibling)
                            return op_ctx

            # Append the rename pattern with an increasing counter until an
            # unused destination name is found.
            suffix_id = 1
            while True:
                new_dst_bn = (dst_bn_noext +
                              self.rename_pattern % suffix_id + dst_ext)
                new_dst = path.join(dst_dir, new_dst_bn)
                if not path.exists(new_dst):
                    op_ctx.dest = new_dst
                    return op_ctx
                suffix_id += 1
        else:
            op_ctx.dest = pref_dest
            return op_ctx
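
The collision-avoidance loop in Example #9 only requires that self.rename_pattern be a printf-style suffix containing a %d placeholder. A tiny illustration under that assumption (the pattern value itself is hypothetical):

# Hypothetical rename_pattern value for Example #9.
rename_pattern = ' (%d)'
print('photo' + rename_pattern % 1 + '.jpg')  # -> photo (1).jpg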
Example #10
    def __init__(self, file_path, should_dump=False, dump_path=None):
        self.file = file_path
        # Validate the path before hashing or reading the file contents.
        if not path.exists(self.file):
            raise Exception("file does not exist: {}".format(self.file))
        self.sha256 = sha256_checksum(self.file)
        with open(file_path, 'rb') as f:
            self.data = f.read()
        self.dump = None
        self.results = {}

        if should_dump:
            if path.isdir(dump_path):
                self.dump = dump_path
            else:
                log.error("folder does not exist: {}".format(dump_path))
                self.dump = None
Example #11
print "------------------------------------"
print "zookeeper configuratioins : %s %s"  %(java64_home,zookeeper_hosts) # zookeeper configuratioins : /usr/jdk64/jdk1.8.0_112 [u'master', u'node1', u'node2']
print "------------------------------------"

all_hosts = default("/clusterHostInfo/all_hosts", [])
all_racks = default("/clusterHostInfo/all_racks", [])

cluster_name = config["clusterName"]

# clickhouse-config.xml
clickhouse_config_json_template = config['configurations']['clickhouse-config']
tcp_port = config['configurations']['clickhouse-config']['tcp_port']
users_config = config['configurations']['clickhouse-config']['users_config']
clickhouse_data_path = config['configurations']['clickhouse-config']['path']

# clickhouse-metrika cluster configurations
clickhouse_metrika_json_template = config['configurations']['clickhouse-metrika']

# clickhouse-user configurations
clickhouse_users_json_template = config['configurations']['clickhouse-users']['clickhouse_users']

user_admin = config['configurations']['clickhouse-users']['user_admin']

user_admin_password = config['configurations']['clickhouse-users']['user_admin_password']
user_admin_password_sha256 = utils.sha256_checksum(user_admin_password)

user_ck = config['configurations']['clickhouse-users']['user_ck']

user_ck_password = config['configurations']['clickhouse-users']['user_ck_password']
user_ck_password_sha256 = utils.sha256_checksum(user_ck_password)
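
Unlike the other examples, the snippet above passes a password string rather than a file path to sha256_checksum, presumably to fill a password_sha256_hex-style setting in the ClickHouse users configuration. A string-hashing variant of the helper would look roughly like this (an assumption, since utils.sha256_checksum itself is not shown):

# Hypothetical string-hashing variant of utils.sha256_checksum as used above.
import hashlib


def sha256_checksum(text):
    return hashlib.sha256(text.encode('utf-8')).hexdigest()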
Example #12
            if args.verbose:
                print('Scanning %s' % dirName)
        except UnicodeEncodeError as e:
            continue

        for filename in fileList:
            full_path = os.path.join(dirName, filename)
            try:
                p = subprocess.Popen(['file', '--brief', full_path],
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)
                output, errors = p.communicate()
                file_type = str(output.strip().decode())

                if "exe" in file_type:
                    file_hash = utils.sha256_checksum(full_path)

                    file_info = dict()
                    file_info["path"] = full_path
                    file_info["type"] = file_type
                    file_info["sha256"] = file_hash

                    status = utils.check_file_status(file_info)

                    if status == utils.FILE_UNKNOWN:
                        utils.add_file_to_db(file_info)

                    elif status == utils.FILE_KNOWN_TOUCHED:
                        utils.add_alert_do_db(file_info)

                    elif status == utils.FILE_KNOWN_UNTOUCHED:
                        pass  # assumed: a known, unmodified file needs no further action
Example #13
    def __init__(self, file_path):
        self.file = file_path
        self.sha256 = sha256_checksum(self.file)
        self.oPDFiD = None
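
The constructor above appears to belong to the MalPDFiD wrapper invoked in Example #2; typical usage would be a single call chain (the sample path is illustrative only):

# Hypothetical usage tying this constructor back to Example #2.
pdfid_results = MalPDFiD('/samples/suspicious.pdf').run()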