Beispiel #1
0
def preprocess_data(source_dir, out_dir):
    """
    Loads data from source, then performs cleaning and preprocessing steps. Each
    preprocessed file is saved to the out directory.

    If `source_dir` does not exist, it is created as a symlink pointing at
    `DATA_DIRECTORY`. Every entry directly inside `out_dir` is unlinked before
    processing starts. Source files whose name contains 'novpn' are skipped;
    each remaining file is handed to `_process_file` as a
    `(filepath, out_path)` tuple inside a multiprocessing pool.

    Args:
        source_dir: Directory holding the raw input files.
        out_dir: Directory that receives the preprocessed files.

    Returns:
        None. Timing and success/failure counts are reported through `logging`.
    """

    source_path = pathlib.Path(source_dir)
    out_path = pathlib.Path(out_dir)

    # Ensure source exists. If not then we'll create it with symlinking.
    if not source_path.exists():

        # Create the parents. It's important we don't make the final directory
        # otherwise the symlink will fail since it already exists!
        ensure_path_exists(source_path.parent, is_dir=True)

        # Symlink data to make our source directory
        logging.info(
            f"Symlinking {source_path} to raw data from {DATA_DIRECTORY}")
        source_path.symlink_to(pathlib.Path(DATA_DIRECTORY),
                               target_is_directory=True)

    # Ensure out directory exists.
    ensure_path_exists(out_path, is_dir=True)

    # Clean out existing preprocessed files.
    logging.info(f"Removing existing files from `{out_path}`")
    for fp in out_path.iterdir():
        fp.unlink()

    # We're only working with data which used a VPN, so we can ignore the rest.
    # A narrower glob pattern could further constrain what files we look at.
    to_process = [
        fp for fp in source_path.glob('*') if 'novpn' not in fp.name
    ]

    # We'll use a multiprocessing pool to parallelize our preprocessing since
    # it involves computation.
    args = [(filepath, out_path) for filepath in to_process]

    workers = multiprocessing.cpu_count()
    logging.info(f'Starting a processing pool of {workers} workers.')
    start = time.time()
    # The context manager shuts the workers down once `map` has returned, so
    # no worker processes are left behind after this function exits.
    with multiprocessing.Pool(processes=workers) as pool:
        results = pool.map(_process_file, args)
    logging.info(f'Time elapsed: {round(time.time() - start)} seconds.')

    # Force a boolean dtype: without it an empty result list becomes a float
    # array, on which the logical negation used for the failure count raises.
    # NOTE(review): assumes `_process_file` returns a truthy value on success
    # and a falsy one on failure -- confirm against its definition.
    results = np.array(results, dtype=bool)
    succeeded = int(results.sum())
    failed = int((~results).sum())
    logging.info(f'{succeeded} input files successfully preprocessed.')
    logging.info(f"{failed} files couldn't be procesed.")
Beispiel #2
0
def train_model(source, out, validation_size, classifier, model_params):
    """
    Trains model with data preparation and desired classifier.

    The csv at `source` is read, the 'streaming' column is used as the label
    and every other column as a feature. The data is split into a training and
    a validation part, standardized, and used to fit the chosen classifier.
    The validation accuracy is logged and the fitted classifier is pickled to
    `out`.

    Args:
        source: Path of the feature csv to read.
        out: Path the pickled classifier is written to.
        validation_size: Passed to `train_test_split` as `test_size`.
        classifier: One of "RandomForest", "KNN" or "LogisticRegression".
        model_params: Mapping from classifier name to the keyword arguments
            used to construct that classifier.

    Returns:
        The fitted classifier.

    Raises:
        KeyError: If `classifier` is missing from `model_params` or is not one
            of the supported classifier names.
    """

    df = pd.read_csv(source)
    X = df.drop(columns=['streaming']).values
    y = df['streaming'].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=validation_size)

    # Normalizing data with respect to the training data only. Fitting the
    # scaler on all of X would let validation rows influence the scaling and
    # make the reported accuracy optimistic.
    # NOTE(review): the scaler is not saved alongside the model, so whoever
    # loads the pickle must reproduce this scaling themselves -- confirm.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Choosing the desired model type
    classifier_type = {
        "RandomForest": RandomForestClassifier,
        "KNN": KNeighborsClassifier,
        "LogisticRegression": LogisticRegression
    }

    model_params = model_params[classifier]
    clf = classifier_type[classifier](**model_params)

    # Outputting prediction and accuracy score
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    logging.info('%s model accuracy: %s', classifier, accuracy)

    ensure_path_exists(out)
    with open(out, 'wb') as outfile:
        pickle.dump(clf, outfile)
    logging.info(f'Model saved to {out}')

    return clf
def collect_data(
    username, provider, quality, speed, vpn, platform, clean, date, interface):
    """
    Prepares a capture with network_stats.

    Logs a notice that this functionality is no longer supported here, builds
    the output csv name from the arguments, and makes sure its path exists.
    The network_stats command line is assembled but never executed.

    Note that the `date` argument is not used: the file name always carries
    today's date in YYYYMMDD form.
    """

    logging.info(
        'This functionality is moving to an entirely new repository, and its '
        'development in this repository has ceased being supported.',
    )

    # Today's date replaces whatever was passed in as `date`.
    today_stamp = datetime.date.today().strftime('%Y%m%d')

    output_file = (
        f'{username}-{provider}-{quality}-{speed}-{vpn}-{platform}-{clean}-'
        f'{today_stamp}.csv'
    )
    ensure_path_exists(output_file)

    # Kept for reference only; the call that ran this command is disabled.
    script = 'network-stats/network_stats.py'
    csv_flag = '-e'
    command = f'python3.8 {script} -i {interface} -s {csv_flag} {output_file}'
    # os.system(command)

    return None
Beispiel #4
0
def train_model(source, out, validation_size, classifier, model_params):
    """
    Trains model with data preparation and desired classifier.

    The csv at `source` is read, the 'provider' column is used as the label
    and every other column as a feature. The data is split into a training and
    a validation part, standardized, and used to fit the chosen classifier.
    The validation accuracy is logged, two confusion matrices (raw counts and
    row-normalized) are saved as png files in the current working directory
    and shown, and the fitted classifier is pickled to `out`.

    Args:
        source: Path of the feature csv to read.
        out: Path the pickled classifier is written to.
        validation_size: Passed to `train_test_split` as `test_size`.
        classifier: One of "RandomForest", "KNN" or "LogisticRegression".
        model_params: Mapping from classifier name to the keyword arguments
            used to construct that classifier.

    Returns:
        The fitted classifier.

    Raises:
        KeyError: If `classifier` is missing from `model_params` or is not one
            of the supported classifier names.
    """

    df = pd.read_csv(source)
    X = df.drop(columns=['provider']).values
    y = df['provider'].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=validation_size)

    # Normalizing data with respect to the training data only. Fitting the
    # scaler on all of X would let validation rows influence the scaling and
    # make the reported accuracy optimistic.
    # NOTE(review): the scaler is not saved alongside the model, so whoever
    # loads the pickle must reproduce this scaling themselves -- confirm.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Choosing the desired model type
    classifier_type = {
        "RandomForest": RandomForestClassifier,
        "KNN": KNeighborsClassifier,
        "LogisticRegression": LogisticRegression
    }

    model_params = model_params[classifier]
    clf = classifier_type[classifier](**model_params)

    # Outputting prediction and accuracy score
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    logging.info('%s model accuracy: %s', classifier, accuracy)

    # NOTE(review): these display labels are applied positionally; verify
    # that their order matches the order of the classes found in `y`.
    class_names = ['other', 'youtube', 'prime', 'netflix', 'yt-live', 'twitch']

    # One entry per figure: (normalize option, plot title, output file).
    confusion_plots = [
        (None, "Confusion matrix, without normalization",
         'confusion_matrix.png'),
        ('true', "Normalized confusion matrix",
         'normalized_confusion_matrix.png'),
    ]
    for normalize, title, filename in confusion_plots:
        disp = plot_confusion_matrix(clf,
                                     X_test,
                                     y_test,
                                     display_labels=class_names,
                                     cmap=plt.cm.Blues,
                                     normalize=normalize)
        disp.ax_.set_title(title)
        # Save before showing so the file is written from the live figure.
        plt.savefig(filename)
        plt.show()

    ensure_path_exists(out)
    with open(out, 'wb') as outfile:
        pickle.dump(clf, outfile)
    logging.info(f'Model saved to {out}')

    return clf
def _load_json_config(config_dir, filename):
    """Returns the parsed contents of the json file `filename` in `config_dir`."""
    with open(Path(config_dir, filename), 'r') as f:
        return json.load(f)


def main(targets):
    """
    Runs the pipeline stages named in `targets`.

    Recognized targets:
        all       Run `data`, `features` and `train`. Also the default when
                  no targets are given.
        test      Read stage configuration from 'test/config' instead of
                  'config'. If it is the only target, all stages are run.
        data      Run `preprocess_data` with 'data-params.json'.
        features  Run `create_features` with 'features-params.json'.
        train     Run `train_model` with 'train-params.json'.
        generate  Run `collect_data` with 'generate-params.json'. Never part
                  of `all`.
        clean     Not implemented.

    Logging is configured from 'config/logging.json' before the targets are
    inspected, so the logging configuration is always the non-test one.

    Args:
        targets: Collection of target names, e.g. the command line arguments.

    Raises:
        NotImplementedError: If 'clean' is among the targets.
    """
    import sys

    # Will change to test config path if test target is seen
    config_dir = 'config'

    # Set up logging
    logging_params = _load_json_config(config_dir, 'logging.json')
    root_logger = logging.getLogger()

    if logging_params['produce_logs']:
        log_file = logging_params['log_file']
        ensure_path_exists(log_file)
        logging.basicConfig(
            filename=log_file,
            filemode='a',
            format='%(asctime)s, %(name)s %(levelname)s %(message)s',
            datefmt='%H:%M:%S',
            level=logging.DEBUG)
        logging.info(f"{'*'*80}\nBEGIN RUN\n{'*'*80}")
    else:
        # Without basicConfig the root logger stays at WARNING, which would
        # silently drop every info message emitted below.
        root_logger.setLevel(logging.INFO)

    # Regardless of if a logfile is being collected, we should also get the logs
    # to show up in standard out. StreamHandler writes to stderr unless given
    # a stream, so stdout is passed explicitly.
    root_logger.addHandler(logging.StreamHandler(sys.stdout))

    run_all = 'all' in targets or not targets

    if 'clean' in targets:
        # Would probably just delete the data folder... but should truly look at
        # the configuration to decide what to delete.
        raise NotImplementedError

    if 'test' in targets:
        # If `test` is the only target seen, then run all targets with the
        # configs and data found in the test directory.
        #
        # Otherwise, if additional targets are specified then only run those
        # targets but still use test config (and therefore test data).
        logging.info(
            'Test target recognized. Will use test configuration files.')
        config_dir = 'test/config'

        if len(targets) == 1:
            run_all = True

    if 'data' in targets or run_all:
        # Load, clean, and preprocess data. Then store preprocessed data to
        # configured intermediate directory.
        logging.info('Data target recognized.')
        data_params = _load_json_config(config_dir, 'data-params.json')

        # These messages reach stdout through the stream handler above, so no
        # separate print calls are needed.
        logging.info('Running ETL pipeline.')
        preprocess_data(**data_params)
        logging.info('ETL pipeline complete.')

    if 'features' in targets or run_all:
        # Creates features for preprocessed data and stores feature-engineered
        # data to a configured csv and directory.
        logging.info('Features target recognized.')
        features_params = _load_json_config(config_dir, 'features-params.json')

        logging.info('Engineering features.')
        create_features(**features_params)
        logging.info('Feature engineering complete.')

    if 'train' in targets or run_all:
        # Trains model based on feature-engineered data, report some of its
        # scores, and save the model.
        logging.info('Train target recognized.')
        train_params = _load_json_config(config_dir, 'train-params.json')

        logging.info('Training model.')
        train_model(**train_params)
        logging.info('Model training complete.')

    if 'generate' in targets:
        # Generates data from network-stats
        #
        # NOTE: This target should *not* be included in `all`.
        logging.info('Generate target recognized.')
        generate_params = _load_json_config(config_dir, 'generate-params.json')

        logging.info('Collecting data with network-stats.')
        collect_data(**generate_params)
        logging.info('Data collection complete.')
def create_features(source_dir, out_dir, out_file, chunk_size,
                    rolling_window_1, rolling_window_2, resample_rate,
                    frequency):
    """
    Engineers features from preprocessed data and writes them to a csv.

    Every file in `source_dir` whose name starts with 'preprocessed' is split
    into chunks with `split`. Each chunk is passed to `_engineer_features` in
    a multiprocessing pool, chunks for which that returns None are dropped,
    and the remaining rows are stacked into one table. Rows containing
    missing values are removed before the table is written to
    `out_dir/out_file`.

    Args:
        source_dir: Directory holding the preprocessed files.
        out_dir: Directory the feature csv is written to.
        out_file: File name of the feature csv.
        chunk_size: Chunk length handed to `split`, in milliseconds.
        rolling_window_1: Forwarded unchanged to `_engineer_features`.
        rolling_window_2: Forwarded unchanged to `_engineer_features`.
        resample_rate: Forwarded unchanged to `_engineer_features`.
        frequency: Forwarded unchanged to `_engineer_features`.

    Raises:
        ValueError: If no chunk produced any features, e.g. because no
            preprocessed files were found.
    """

    # Ensure that the source and output directories exist.
    ensure_path_exists(source_dir, is_dir=True)
    ensure_path_exists(out_dir, is_dir=True)

    # Splitting each preprocessed file into chunk_size'd chunks.
    # Chunk size is in milliseconds.
    preprocessed_dfs = glob.glob(os.path.join(source_dir, 'preprocessed*'))
    split_df_groups = [split(f, chunk_size) for f in preprocessed_dfs]

    # Flattening the per-file lists into one list of chunks.
    merged = list(itertools.chain.from_iterable(split_df_groups))

    cols = [
        'smoothed_mean_delay_10s', 'smoothed_mean_delay_60s',
        'received_mean_size', 'sent_mean_size', 'sent_small_prop',
        'received_large_prop', 'received_small_prop',
        'max_frequency_prominence', 'small_packet_ratio',
        'medium_packet_ratio', 'large_packet_ratio', 'download_bytes_cv',
        'upload_bytes_cv', 'provider'
    ]

    # Each chunk holds the streaming provider label at index 0 and the actual
    # dataframe at index 1; the worker expects the dataframe first.
    args = [(chunk[1], chunk[0], rolling_window_1, rolling_window_2,
             resample_rate, frequency) for chunk in merged]

    workers = multiprocessing.cpu_count()
    logging.info(f'Starting a processing pool of {workers} workers')
    start = time.time()
    # The context manager shuts the workers down once `map` has returned, so
    # no worker processes are left behind after this function exits.
    with multiprocessing.Pool(processes=workers) as pool:
        results = pool.map(_engineer_features, args)
    logging.info(f'Time elapsed: {round(time.time() - start)} seconds.')

    feature_rows = [row for row in results if row is not None]
    if not feature_rows:
        # Stacking an empty list fails with an unhelpful message, so say what
        # actually went wrong instead.
        raise ValueError(
            f'No features could be engineered from `{source_dir}`: '
            f'{len(preprocessed_dfs)} preprocessed files yielded '
            f'{len(merged)} chunks, none of which produced features.')
    features = np.vstack(feature_rows)

    features_df = pd.DataFrame(features, columns=cols).dropna()
    logging.info(f'{features_df.shape[0]} chunks of data feature engineered.')

    features_df.to_csv(os.path.join(out_dir, out_file), index=False)
    logging.info(f'Features created: {list(features_df.columns)}')