Example #1
def _maybe_extract(data_dir, extracted_data, archive):
    # If data_dir/extracted_data does not exist, extract archive in data_dir
    if not gfile.Exists(path.join(data_dir, extracted_data)):
        print('Extracting "%s"...' % archive)
        with tarfile.open(archive) as tar:
            members = list(tar.getmembers())
            for i, member in enumerate(members):
                print_progress(i + 1, len(members))
                tar.extract(member, path=data_dir)
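Every example on this page calls a `print_progress` helper that is never shown. A minimal sketch of what it might look like, inferred only from the call sites (positional `count` and `total`, optional `prefix` and `estimated_time` keywords; the body is an assumption, not the original helper):

import sys

def print_progress(count, total, prefix='', estimated_time=None):
    # Illustrative sketch only - the real helper in the source repositories
    # may differ. Draws a one-line bar, overwriting the previous line.
    bar_len = 40
    filled = int(bar_len * count / total) if total else bar_len
    percent = 100.0 * count / total if total else 100.0
    eta = '' if estimated_time is None else ' ~{}s left'.format(estimated_time)
    bar = '=' * filled + '-' * (bar_len - filled)
    sys.stdout.write('\r%s [%s] %5.1f%%%s' % (prefix, bar, percent, eta))
    sys.stdout.flush()
    if count >= total:
        sys.stdout.write('\n')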
Example #2
def process(input, output, silence_length, silence_thresh, temp_dir, padding,
            language, ambient_noise, keep_temporary, silent, seek_step,
            max_workers):
    # open the audio file stored in file system
    speech = AudioSegment.from_file(input)

    if not silent:
        print('Splitting (this could take a while...)')
    # split track where silence is <silence-length> ms. or bigger
    chunks = split_on_silence(
        speech,
        # must be silent for at least <silence-length> ms.
        min_silence_len=silence_length,
        # consider it silent if quieter than <silence-thresh> dBFS
        silence_thresh=silence_thresh,
        seek_step=seek_step)
    total = len(chunks)

    # create temporary dir if it doesn't exist
    try:
        os.mkdir(temp_dir)
    except FileExistsError:
        pass

    # Create <padding> ms silence chunk
    silence = AudioSegment.silent(duration=padding)
    futures = []
    try:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # process each chunk
            for i, chunk in enumerate(chunks):
                futures.append(
                    executor.submit(
                        process_chunck, silence + chunk + silence, i,
                        os.path.basename(os.path.splitext(input)[0]), temp_dir,
                        ambient_noise, keep_temporary, language))
            if not silent:
                print_progress(0, total, prefix='Converting:')
            for i, future in enumerate(as_completed(futures)):
                exc = future.exception()
                if exc:
                    # an unhandled exception means the conversion cannot
                    # complete - forcefully stop the remaining worker threads
                    # (this relies on concurrent.futures internals) and abort
                    executor._threads.clear()
                    thread._threads_queues.clear()
                    raise exc

                if not silent:
                    print_progress(i + 1, total, prefix='Converting:')
    except Exception as e:
        sys.stderr.write('\nError: Canceling execution: {}\n'.format(e))
        sys.exit(1)

    if not silent:
        print('\nSaving...')
    with open(output, 'w+') as f:
        for future in futures:
            text = future.result()
            if text is not None:
                f.write('{}.\n'.format(text))
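The `process_chunck` helper invoked above is not part of the snippet. A plausible sketch, assuming pydub for exporting the chunk and the `speech_recognition` package for transcription (the signature mirrors the call above; everything inside the body is an assumption):

import os
import speech_recognition as sr

def process_chunck(chunk, index, base_name, temp_dir, ambient_noise,
                   keep_temporary, language):
    # Hypothetical sketch - not the original implementation.
    chunk_path = os.path.join(temp_dir, '{}-{:04d}.wav'.format(base_name, index))
    chunk.export(chunk_path, format='wav')  # pydub AudioSegment.export
    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(chunk_path) as source:
            if ambient_noise:
                recognizer.adjust_for_ambient_noise(source)
            audio = recognizer.record(source)
        return recognizer.recognize_google(audio, language=language)
    except sr.UnknownValueError:
        return None  # nothing intelligible in this chunk
    finally:
        if not keep_temporary:
            os.remove(chunk_path)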
Example #3
def _maybe_extract(target_dir, extracted_data, archive_path):
    # If target_dir/extracted_data does not exist, extract archive in target_dir
    extracted_path = path.join(target_dir, extracted_data)
    if not path.exists(extracted_path):
        print('No directory "%s" - extracting archive...' % extracted_path)
        with tarfile.open(archive_path) as tar:
            members = list(tar.getmembers())
            for i, member in enumerate(members):
                print_progress(i + 1, len(members))
                tar.extract(member, path=target_dir)
    else:
        print('Found directory "%s" - not extracting it from archive.' % extracted_path)
Example #4
def _maybe_download(archive_name, target_dir, archive_url):
    # If archive file does not exist, download it...
    archive_path = path.join(target_dir, archive_name)
    if not path.exists(archive_path):
        print('No archive "%s" - downloading...' % archive_path)
        req = requests.get(archive_url, stream=True)
        total_size = int(req.headers.get('content-length', 0))
        done = 0
        with open(archive_path, 'wb') as f:
            for data in req.iter_content(1024 * 1024):
                done += len(data)
                f.write(data)
                print_progress(done, total_size)
    else:
        print('Found archive "%s" - not downloading.' % archive_path)
    return archive_path
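The download and extraction helpers on this page are typically chained; a hypothetical usage (archive name, URL and directory are placeholders, not from the original sources):

# Hypothetical usage - all names below are placeholders.
archive_path = _maybe_download('corpus.tar.gz', '/data',
                               'https://example.com/corpus.tar.gz')
_maybe_extract('/data', 'corpus', archive_path)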
Example #5
def one_sample(sample):
    mp3_filename = path.join(*(sample[0].split('/')))
    mp3_filename = path.join(extracted_dir, mp3_filename)
    # Storing wav files next to the mp3 ones - just with a different suffix
    wav_filename = path.splitext(mp3_filename)[0] + ".wav"
    _maybe_convert_wav(mp3_filename, wav_filename)
    frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT))
    file_size = path.getsize(wav_filename)
    with lock:
        if int(frames / SAMPLE_RATE * 1000 / 10 / 2) < len(str(sample[1])):
            # Excluding samples that are too short to fit the transcript
            counter['too_short'] += 1
        elif frames / SAMPLE_RATE > MAX_SECS:
            # Excluding very long samples to keep a reasonable batch-size
            counter['too_long'] += 1
        else:
            # This one is good - keep it for the target CSV
            rows.append((wav_filename, file_size, sample[1]))
        print_progress(counter['all'], num_samples)
        counter['all'] += 1
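The `too_short` test above compares audio length to transcript length: `frames / SAMPLE_RATE` is the clip duration in seconds, `* 1000 / 10` converts that into 10 ms feature windows, and `/ 2` demands at least two windows per transcript character. A quick sanity check of the arithmetic, assuming the common 16 kHz sample rate:

SAMPLE_RATE = 16000                           # assumed rate
frames = 48000                                # a 3-second clip at 16 kHz
windows = frames / SAMPLE_RATE * 1000 / 10    # 300 windows of 10 ms each
max_chars = int(windows / 2)                  # at most 150 characters fit
# a transcript longer than 150 characters marks this clip as 'too_short'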
Example #6
def match(data_in, fp_in, ip=None, force=False, binary=False, log_path=None, test=False, print_report=False, latex=False):
    data = load_data(data_in)
    fps = joblib.load(fp_in)

    method_names = list(fps["method_fingerprints"].keys())

    print("Matching ...")

    if log_path:
        log_file = open(log_path, "w")

    results = []

    if not ip:
        for method_name in method_names:
            if not methods.methods.get(method_name):
                print("Warning: no such method '{}'".format(method_name))
                continue
            if not fps["method_fingerprints"].get(method_name):
                print("Warning: the fingerprint file does not contain a fingerprint for method '{}'".format(method_name))
                continue
            method = methods.methods[method_name]
            method.use_fingerprints(fps["method_fingerprints"][method_name])
            if test:
                configs = method.get_configs()
            else:
                configs = [method.get_default_config()]

            num_matched = 0

            for conf in configs:
                method.use_config(conf)

                start = time.time()

                num_matched = 0
                y_true = []
                y_pred = []
                labels = []
                print_progress(0, len(data))

                if test:
                    # cannot use pool in test because diff method needs to cache results
                    # which won't work with multiprocessing
                    match_map = map(functools.partial(method.match, force=force, test=test), data.values())
                else:
                    pool = multiprocessing.Pool(4)
                    match_map = pool.imap_unordered(functools.partial(method.match, force=force, test=test), data.values())

                count = 0
                last_count = 0
                time_window_start = time.time()
                time_left = None
                for host, matches in match_map:
                    if count > 0:
                        if time.time() - time_window_start > 2:
                            elapsed = time.time() - time_window_start
                            avg_host_time = elapsed / (count - last_count)
                            time_left = int(avg_host_time * (len(data) - count))
                            last_count = count
                            time_window_start = time.time()
                    count += 1
                    print_progress(count, len(data), estimated_time=time_left)

                    host_labels = host.label_str()
                    if (method.is_binary_classifier() or binary) and host_labels != "unlabeled":
                        host_labels = "malicious"
                    if host_labels not in labels:
                        labels.append(host_labels)

                    match_labels = Label.to_str(matches)
                    if (method.is_binary_classifier() or binary) and match_labels != "unlabeled":
                        match_labels = "malicious"
                    if match_labels not in labels:
                        labels.append(match_labels)

                    if match_labels != "unlabeled" and not print_report:
                        print("\x1b[2K\r{}: {}".format(host.ip, match_labels))
                    y_true.append(labels.index(host_labels))
                    y_pred.append(labels.index(match_labels))

                if not test:
                    pool.close()

                end = time.time()

                if print_report:
                    report = classification_report(y_true, y_pred, target_names=labels,
                                                   zero_division=0, digits=5,
                                                   output_dict=latex)
                    if latex:
                        report = report_to_latex_table(report)

                    perf_text = " ----- Performance result -----\n"
                    perf_text += "Method: {}\n".format(method_name)
                    perf_text += "Config: " + ", ".join("{} = {}".format(k, v) for k, v in conf.items()) + "\n"
                    perf_text += "Classification report:\n"
                    perf_text += str(report) + "\n"
                    perf_text += "Confusion Matrix (x-axis: guess, y-axis: true):\n"
                    perf_text += "Labels: {}\n".format(labels)
                    perf_text += str(confusion_matrix(y_true, y_pred)) + "\n"
                    perf_text += "Took {} seconds to perform".format(end-start)
                    perf_text += "\n\n"

                    precision = precision_score(y_true, y_pred, average="micro")
                    results.append({"method": method_name, "config": conf, "precision": precision})

                    if log_path:
                        log_file.write(perf_text)
                        log_file.flush()

                    print("")
                    print(perf_text)

                if hasattr(method, "post_match"):
                    method.post_match()

        # if two or more methods were used, print precision ranking
        if len(results) > 1:
            result_text = " ----- Best performing method/config -----\n"
            for i, result in enumerate(sorted(results, key=lambda k: k["precision"], reverse=True)):
                result_text += "{}.\n".format(i+1)
                result_text += "Method: {}\n".format(result["method"])
                result_text += "Config: " + ", ".join("{} = {}".format(k, v) for k, v in result["config"].items()) + "\n"
                result_text += "Precision: {}\n\n".format(result["precision"])

            if log_path:
                log_file.write(result_text)

            print(result_text)

    else:
        host = data.get(ip)

        if not host:
            print("Error: No host {} exists in data file.".format(ip))
            sys.exit(1)

    if log_path:
        log_file.close()
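Worth noting: the matching loop above derives its ETA from throughput measured over a sliding ~2 second window rather than from a global average, so the estimate adapts to hosts that are slow to probe. The same idea, pulled out as a small self-contained helper (names here are illustrative, not from the original):

import time

def update_eta(done, total, window):
    # 'window' is mutable state shared across calls, e.g.
    # {'start': time.time(), 'count': 0, 'eta': None}
    # Re-estimate roughly every 2 seconds of observed work.
    if done > window['count'] and time.time() - window['start'] > 2:
        elapsed = time.time() - window['start']
        per_item = elapsed / (done - window['count'])
        window['eta'] = int(per_item * (total - done))
        window['count'] = done
        window['start'] = time.time()
    return window['eta']

# usage inside a processing loop:
#     print_progress(count, total, estimated_time=update_eta(count, total, window))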
Example #7
def database_extract(output, database, label_path, pcap_path, keep):
    host_map = {}
    tls_map = {}

    for db_file in database:
        print("Extracting data from {} ...".format(db_file))
        try:
            # sqlite3.connect() silently creates a missing file, so first make
            # sure the database file actually exists and is readable
            with open(db_file, "r"):
                pass
            dbh = sqlite3.connect(db_file)
        except (OSError, sqlite3.Error):
            print("error: Failed opening database '{}'.".format(db_file))
            sys.exit(1)

        dbh.row_factory = sqlite3.Row

        curse = dbh.cursor()
        curse.execute("SELECT COUNT(*) FROM Probe;")
        total_rows = curse.fetchone()[0]

        curse.execute("SELECT * FROM Probe;")

        processed_rows = 0

        while True:
            row = curse.fetchone()
            print_progress(processed_rows, total_rows)
            processed_rows += 1

            if not row:
                break

            ip = row["ip"]
            uuid = row["uuid"]

            if not host_map.get(ip):
                host_map[ip] = modules.host.Host(ip, uuid)

            if keep != "both" and host_map[ip].uuid != uuid:
                if keep == "old":
                    # don't use the probe data that comes from newer scan
                    continue
                elif keep == "new":
                    # keep the newer scan , trash the older probe data
                    host_map[ip] = modules.host.Host(ip, uuid)
                    if ip in tls_map:
                        del tls_map[ip]

            module_name = row["name"]
            port = row["port"]

            if port == 0:
                mod_obj = modules.get_module(module_name)
                if not mod_obj:
                    continue
                # ip module stuff
                mod_obj.add_data(row)

                if mod_obj.name == "geoip":
                    host_map[ip].geoip = mod_obj
                elif mod_obj.name == "rdns":
                    host_map[ip].rdns = mod_obj
            else:
                # module stuff
                if module_name == "tls":
                    if ip not in tls_map:
                        tls_map[ip] = {}
                    port_obj = tls_map[ip].get(port)
                    if not port_obj:
                        port_obj = modules.get_port("tls", port)
                        tls_map[ip][port] = port_obj
                else:
                    port_obj = host_map[ip].ports.get(port)
                    if not port_obj:
                        port_obj = modules.get_port(module_name, port)
                        host_map[ip].insert_port(port_obj)

                try:
                    port_obj.add_data(row)
                except Exception as e:
                    print("Error adding data for {}:{}: {}".format(ip, port, e))
                    import traceback
                    traceback.print_exc()
                    sys.exit(1)

        curse.close()
        dbh.close()
        print("")

    # adding tls module to ports
    for ip, port_map in tls_map.items():
        for port, tls in port_map.items():
            port_obj = host_map[ip].ports.get(port)
            if not port_obj:
                port_obj = modules.get_port("generic", port)
                host_map[ip].insert_port(port_obj)
            port_obj.tls = tls

    # remove IPs that have no open ports, or where no port gives any response
    print("Filtering hosts without any ports open")

    remove_ip = set()
    for ip in host_map:
        if len(host_map[ip].ports) == 0:
            # TODO: add a flag that decides whether to exclude this or not
            #print("{}: No ports open, omitting".format(ip))
            remove_ip.add(ip)
            continue

        """if len(host_map[ip].responsive_ports()) == 0:
            # TODO: add a flag that decides whether to exclude this or not
            print("{}: No ports responded, omitting".format(ip))
            remove_ip.append(ip)
            continue"""

    for ip in remove_ip:
        del host_map[ip]
    print("Filtered {} hosts".format(len(remove_ip)))

    # add labels to hosts
    if label_path:
        print("Adding labels to hosts")
        with open(label_path, "r") as f:
            line = f.readline()
            while line != "":
                csv = line.strip().split(",")
                line = f.readline()

                if len(csv) != 4:
                    continue

                mwdb_id, ip, port, family = csv
                if ip in host_map:
                    try:
                        port = int(port)
                    except ValueError:
                        # some C2s don't have a port specified in the label
                        port = None

                    host_map[ip].add_label(mwdb_id, family, port)

        # remove labels where the label port is not open, and drop the ip if it
        # loses all labels, since that means the relevant (C2 acting) port is closed
        print("Filtering hosts without any label ports open")

        remove_ip = set()
        for ip in host_map:
            if host_map[ip].filter_labels():
                remove_ip.add(ip)

        for ip in remove_ip:
            del host_map[ip]
        print("Filtered {} hosts".format(len(remove_ip)))

    if pcap_path:
        print("Adding pcap data...")
        pcap_extract(pcap_path, host_map)


    # TODO: serialize host object

    print("{} hosts processed".format(len(host_map)))
    print("Saving data to file {} ...".format(output))

    joblib.dump(host_map, output)

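A hypothetical invocation of the extractor above (every file name is a placeholder; `keep` decides which scan wins when two probes of the same IP carry different scan UUIDs):

# Hypothetical usage - every path below is a placeholder.
database_extract(output='hosts.joblib',
                 database=['scan-2021.db', 'scan-2022.db'],
                 label_path='c2-labels.csv',  # mwdb_id,ip,port,family rows
                 pcap_path=None,
                 keep='new')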
Example #8
def _maybe_convert_set(extracted_dir, source_csv, target_csv):
    print()
    if path.exists(target_csv):
        print('Found CSV file "%s" - not importing "%s".' % (target_csv, source_csv))
        return
    print('No CSV file "%s" - importing "%s"...' % (target_csv, source_csv))

    train_dir = path.join(extracted_dir, TRAIN_DIR_NAME)
    dev_dir = path.join(extracted_dir, DEV_DIR_NAME)
    test_dir = path.join(extracted_dir, TEST_DIR_NAME)

    train_files = glob(path.join(train_dir, '*.mp3'))
    dev_files = glob(path.join(dev_dir, '*.mp3'))
    test_files = glob(path.join(test_dir, '*.mp3'))

    samples = []
    with open(source_csv) as source_csv_file:
        reader = csv.DictReader(source_csv_file)
        for row in reader:

            if (((TRAIN_CSV_NAME in source_csv) and any(
                    str(row['filename']) in train_file for train_file in train_files)) or
                    ((DEV_CSV_NAME in source_csv) and any(
                        str(row['filename']) in dev_file for dev_file in dev_files)) or
                    ((TEST_CSV_NAME in source_csv) and any(
                        str(row['filename']) in test_file for test_file in test_files))):
                samples.append((row['filename'], row['text']))

    # Mutable counters for the concurrent embedded routine
    counter = {'all': 0, 'too_short': 0, 'too_long': 0}
    lock = Lock()
    num_samples = len(samples)
    rows = []

    def one_sample(sample):
        mp3_filename = path.join(*(sample[0].split('/')))
        mp3_filename = path.join(extracted_dir, mp3_filename)
        # Storing wav files next to the mp3 ones - just with a different suffix
        wav_filename = path.splitext(mp3_filename)[0] + ".wav"
        _maybe_convert_wav(mp3_filename, wav_filename)
        frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT))
        file_size = path.getsize(wav_filename)
        with lock:
            if int(frames / SAMPLE_RATE * 1000 / 10 / 2) < len(str(sample[1])):
                # Excluding samples that are too short to fit the transcript
                counter['too_short'] += 1
            elif frames / SAMPLE_RATE > MAX_SECS:
                # Excluding very long samples to keep a reasonable batch-size
                counter['too_long'] += 1
            else:
                # This one is good - keep it for the target CSV
                rows.append((wav_filename, file_size, sample[1]))
            print_progress(counter['all'], num_samples)
            counter['all'] += 1

    print('Importing mp3 files...')
    pool = Pool(cpu_count())
    pool.map(one_sample, samples)
    pool.close()
    pool.join()

    print_progress(num_samples, num_samples)

    print('Writing "%s"...' % target_csv)
    with open(target_csv, 'w') as target_csv_file:
        writer = csv.DictWriter(target_csv_file, fieldnames=FIELDNAMES)
        writer.writeheader()
        for i, row in enumerate(rows):
            filename, file_size, transcript = row
            print_progress(i + 1, len(rows))
            writer.writerow({'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript})

    print('Imported %d samples.' % (counter['all'] - counter['too_short'] - counter['too_long']))
    if counter['too_short'] > 0:
        print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])
    if counter['too_long'] > 0:
        print('Skipped %d samples that were longer than %d seconds.' % (counter['too_long'], MAX_SECS))
Example #9
def _maybe_convert_set(extracted_dir, source_csv, target_csv):
    print()
    if path.exists(target_csv):
        print('Found CSV file "%s" - not importing "%s".' % (target_csv, source_csv))
        return
    print('No CSV file "%s" - importing "%s"...' % (target_csv, source_csv))
    samples = []
    with open(source_csv) as source_csv_file:
        reader = csv.DictReader(source_csv_file)
        for row in reader:
            samples.append((row['filename'], row['text']))

    # Mutable counters for the concurrent embedded routine
    counter = {'all': 0, 'too_short': 0, 'too_long': 0}
    lock = Lock()
    num_samples = len(samples)
    rows = []

    def one_sample(sample):
        mp3_filename = path.join(*(sample[0].split('/')))
        mp3_filename = path.join(extracted_dir, mp3_filename)
        # Storing wav files next to the mp3 ones - just with a different suffix
        wav_filename = path.splitext(mp3_filename)[0] + ".wav"
        _maybe_convert_wav(mp3_filename, wav_filename)
        frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT))
        file_size = path.getsize(wav_filename)
        with lock:
            if int(frames / SAMPLE_RATE * 1000 / 10 / 2) < len(str(sample[1])):
                # Excluding samples that are too short to fit the transcript
                counter['too_short'] += 1
            elif frames / SAMPLE_RATE > MAX_SECS:
                # Excluding very long samples to keep a reasonable batch-size
                counter['too_long'] += 1
            else:
                # This one is good - keep it for the target CSV
                rows.append((wav_filename, file_size, sample[1]))
            print_progress(counter['all'], num_samples)
            counter['all'] += 1

    print('Importing mp3 files...')
    pool = Pool(cpu_count())
    pool.map(one_sample, samples)
    pool.close()
    pool.join()

    print_progress(num_samples, num_samples)

    print('Writing "%s"...' % target_csv)
    with open(target_csv, 'w') as target_csv_file:
        writer = csv.DictWriter(target_csv_file, fieldnames=FIELDNAMES)
        writer.writeheader()
        for i, row in enumerate(rows):
            filename, file_size, transcript = row
            print_progress(i + 1, len(rows))
            writer.writerow({'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript})

    print('Imported %d samples.' % (counter['all'] - counter['too_short'] - counter['too_long']))
    if counter['too_short'] > 0:
        print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])
    if counter['too_long'] > 0:
        print('Skipped %d samples that were longer than %d seconds.' % (counter['too_long'], MAX_SECS))
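Both of the last two examples call `_maybe_convert_wav`, which is not shown. Since the frame count is later read back with `soxi`, a plausible sketch shells out to SoX; the 16 kHz mono output parameters are an assumption, not taken from the original:

import subprocess
from os import path

def _maybe_convert_wav(mp3_filename, wav_filename):
    # Sketch only: convert mp3 to 16 kHz mono wav with the sox CLI,
    # skipping files already converted on a previous run.
    if not path.exists(wav_filename):
        subprocess.check_call(['sox', mp3_filename,
                               '-r', '16000', '-c', '1', wav_filename])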