Example #1
def dom(input_files, output_files, height, depth, num_workers):
    """Extract the dom features and output them to a directory, in a partitioned fashion.

    INPUT_FILES can be a glob pattern to either a bunch of csvs containing "html,url" or
    the html files themselves. their filename will be used as url in that case("file://filename").

    OUTPUT_FILES names the pattern of the CSV files where to output the features.
    """
    dask.set_options(get=dask.multiprocessing.get, num_workers=num_workers)  # set the number of workers

    # read with pandas rather than dask, since dask has trouble parsing the HTML column
    html_df = pd.read_csv(input_files)  # dataframe with 'html'/'url' columns
    feats = extract_features_from_df(html_df, depth=depth, height=height, num_workers=num_workers)

    # write the features to CSV
    logger.info('Outputting features')
    feats.to_csv(output_files, index=False)

    logger.info('DONE!')
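
A minimal call sketch for the command above. The uppercase names in the
docstring suggest this is a Click command in the original source, so calling
it directly like this is an assumption; the paths and values are hypothetical.

# hypothetical invocation: extract DOM features from all matching CSVs
# using 4 worker processes
dom(input_files='data/pages-*.csv',
    output_files='features/part-*.csv',
    height=5, depth=5, num_workers=4)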
Example #2
def init_datasets(destination, num_workers):
    """Download and convert Cleaneval and Dragnet datasets in DESTINATION_DIR"""
    if not os.path.exists(destination):
        logger.info('Path does not exist - creating')
        os.makedirs(destination)

    # locate the bundled helper script
    script_path = pkg_resources.resource_filename(__name__, 'prepare_data.sh')
    logger.info('Beginning download')
    # run the download/conversion script in a subprocess
    subprocess.run(['bash', script_path, destination, str(num_workers)])

    # finished
    logger.info('Done')
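
A call sketch under the same Click-command assumption; the destination path
and worker count are hypothetical.

# hypothetical invocation: download and convert both datasets with 8 workers
init_datasets(destination='data/raw', num_workers=8)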
Example #3
def merge(cache, output_files, input_files, on):
    """Merges the given files on the columns specified withthe --on option
    and outputs the result to output_files."""
    # set the cache if specified
    if cache is not None:
        logger.info('Using {} as cache'.format(cache))
        dask.set_options(temporary_directory=cache)

    on_columns = on.split(',')  # get the columns to merge on
    result_ddf = dd.read_csv(input_files[0])  # start from the first file
    for in_file in input_files[1:]:
        # merge with each of the remaining files
        logger.info('Merging {}'.format(in_file))
        in_file_ddf = dd.read_csv(in_file)
        result_ddf = result_ddf.merge(in_file_ddf, on=on_columns)

    # output it
    logger.info('Outputting')
    result_ddf.to_csv(output_files, index=False)

    logger.info('Done')
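
A call sketch; the file patterns and join columns are hypothetical, and
input_files is assumed to be an iterable of CSV paths/globs, as the loop implies.

# hypothetical invocation: join three feature files on their shared keys
merge(cache='/tmp/dask-cache',
      output_files='merged/part-*.csv',
      input_files=['feats_a-*.csv', 'feats_b-*.csv', 'feats_c-*.csv'],
      on='url,block_id')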
Example #4
def convert(dataset_directory, output_directory, raw, labels, num_workers,
            cleaneval, blocks):
    """Converts the dataset from DATASET_DIRECTORY to our format and
    outputs it to OUTPUT_DIRECTORY"""
    html_ddf, label_ddf = convert_dataset(
        dataset_directory,
        'cleaneval-' if cleaneval else 'dragnet-',
        cleaneval=cleaneval,
        return_extracted_blocks=blocks)

    dask.set_options(get=dask.multiprocessing.get,
                     num_workers=num_workers)  # set the number of workers
    if raw:
        # output the html
        logger.info('Outputting raw')
        html_ddf.compute().to_csv(output_directory + '/raw.csv', index=False)
    if labels:
        # output the labels
        logger.info('Outputting labels')
        label_ddf.compute().to_csv(output_directory + '/labels.csv',
                                   index=False)

    logger.info('Done!')
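
A call sketch with hypothetical paths, assuming a CleanEval checkout and
direct invocation.

# hypothetical invocation: emit both the raw HTML and the label CSVs
convert(dataset_directory='data/cleaneval',
        output_directory='data/converted',
        raw=True, labels=True, num_workers=4,
        cleaneval=True, blocks=False)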
Example #5
def nested_cv(estimator,
              X,
              y,
              groups=None,
              param_distributions=None,
              n_iter=20,
              internal_n_folds=5,
              internal_total_folds=None,
              external_n_folds=5,
              external_total_folds=None,
              n_jobs=-1,
              scoring='f1'):
    """Perform nested cv with internal randomized CV for model selection
    Given a dataset with optional grouping, a parameter distribution for an estimator
    perform nested CV.

    The model selection is done in the internal loop which consists in a sample
    of folds from a given total(default is to use all folds - but you can basically
    use it as a split). The score is then calculated for each individual fold
    and the returned values are a list of scores and a dataframe containing all internal
    CV results.
    """
    # convert groups and labels
    if isinstance(y, pd.Series):
        y = y.values
    if isinstance(groups, pd.Series):
        groups = groups.values

    # select defaults
    if groups is None:
        groups = np.arange(y.shape[0])
    if internal_total_folds is None:
        internal_total_folds = internal_n_folds
    if external_total_folds is None:
        external_total_folds = external_n_folds

    # get the external splits
    splits = generate_grouped_splits(X,
                                     y,
                                     groups,
                                     total_folds=external_total_folds,
                                     n_folds=external_n_folds)

    # list in which to store all cv results
    all_cv_results = []

    # get the scorer class for the metrics
    scorer = get_scorer(scoring)
    scores = np.zeros(external_n_folds, dtype='float32')

    for run_nb, split in zip(range(external_n_folds), splits):
        logger.info('Model selection on fold number {}...'.format(run_nb))

        # split the dataset
        if isinstance(X, pd.DataFrame):
            X_train, X_test = X.iloc[split[0]], X.iloc[split[1]]
        else:
            X_train, X_test = X[split[0]], X[split[1]]
        y_train, y_test = y[split[0]], y[split[1]]
        groups_train, groups_test = groups[split[0]], groups[split[1]]

        # do the internal loop, pass the corresponding groups
        best_params, cv_results = search_params(
            estimator,
            X_train,
            y_train,
            groups=groups_train,
            param_distributions=param_distributions,
            n_iter=n_iter,
            n_folds=internal_n_folds,
            total_folds=internal_total_folds,
            n_jobs=n_jobs,
            scoring=scoring)

        # refit the best estimator on all the training data
        logger.info('Refitting estimator with best params...')
        best_est = estimator  # note: reuses (and mutates) the caller's estimator
        best_est.set_params(**best_params)  # params must be set as kwargs
        best_est.fit(X_train, y_train)

        # add the score to the list of all scores
        scores[run_nb] = scorer(best_est, X_test, y_test)

        # log the result
        logger.info('SCORE FOR BEST ESTIMATOR ON FOLD NUMBER {} = {}'.format(
            run_nb, scores[run_nb]))

        # add the cross validation dataframe to the list
        cv_results['run_nb'] = run_nb
        all_cv_results.append(cv_results)

    return scores, pd.concat(all_cv_results, ignore_index=True)
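
A short usage sketch with scikit-learn pieces; the estimator, data, and
parameter distribution are illustrative only.

import numpy as np
from scipy.stats import uniform
from sklearn.linear_model import LogisticRegression

# hypothetical data: 200 samples, 10 features, binary labels
X = np.random.rand(200, 10)
y = np.random.randint(0, 2, size=200)

# 3 external folds, each selecting a model via 10 draws of randomized
# search over C on 3 internal folds
scores, cv_results = nested_cv(
    LogisticRegression(solver='liblinear'),
    X, y,
    param_distributions={'C': uniform(0.01, 10)},
    n_iter=10,
    internal_n_folds=3,
    external_n_folds=3,
    scoring='f1')
print(scores.mean(), scores.std())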
Example #6
def train(dataset, output, external_folds, internal_folds,
          n_iter, n_jobs, random_seed, param_file, model_file,
          cli_params, shuffle):
    """Trains a model over a dataset, given a set of values of parameters to use for
    the CV. Parameters used:

    """
    params = {}
    # attempt to read params from file
    if param_file is not None:
        params = json.load(param_file)

    for param in cli_params:
        # load the values from the json
        key, val = param
        loaded_val = json.loads(val)
        params[key] = loaded_val

    logger.debug('Passing params:\n{}'.format(pprint.pformat(params)))
    # extract the params
    blocks_only = params.pop('blocks_only', True)  # use only the blocks

    # load the dataset
    logger.info('Loading the dataset')
    X, y, groups = get_ordered_dataset(dataset, blocks_only=blocks_only, shuffle=shuffle)

    """Evaluate the expected f1-score with nested CV"""
    # unpacking the fold numbers
    internal_n_folds, internal_total_folds = internal_folds
    external_n_folds, external_total_folds = external_folds

    # seed the random number generator
    logger.info('Seeding the random number generator')
    np.random.seed(random_seed)
    # tf can only be seeded inside the worker processes, so no call is made here
    # tf.set_random_seed(random_seed)

    # load the estimator
    estimator, param_distributions = get_param_grid(**params)  # get the appropriate estimator and grid
    logger.debug('Computed params (after applying defaults):\n{}'.format(pprint.pformat(param_distributions)))

    # normalize the params: wrap scalar values in lists if necessary;
    # rv_frozen is exempt because it is a scipy distribution to sample from
    param_distributions = {
        key: val if isinstance(val, (list, rv_frozen)) else [val]
        for key, val in param_distributions.items()
    }

    # output the scores only if specified
    if output is not None:
        # training the model
        logger.info('Performing nested CV')
        scores, cv = nested_cv(estimator, X, y, groups, param_distributions=param_distributions, n_iter=n_iter,
                               internal_n_folds=internal_n_folds, internal_total_folds=internal_total_folds,
                               external_n_folds=external_n_folds, external_total_folds=external_total_folds,
                               n_jobs=n_jobs)

        # outputting
        logger.info('Saving the results')
        output_scores = output.format(suffix='scores.csv')
        output_cv = output.format(suffix='cv.csv')

        np.savetxt(output_scores, scores)
        cv.to_csv(output_cv, index=False)

    # train the model on the whole dataset only if model_file
    # is specified
    if model_file is not None:
        logger.info('Training the model over the entire dataset')
        trained_est = cv_train(estimator, X, y, groups,
                               param_distributions=param_distributions,
                               n_iter=n_iter, n_folds=external_n_folds,
                               total_folds=external_total_folds, n_jobs=n_jobs)

        # save the estimator in pickle
        logger.info('Saving the model')
        with open(model_file, 'wb') as f:
            pickle.dump(trained_est, f)  # pickle the file

    logger.info('DONE')
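
A call sketch under the Click-command assumption. The fold arguments are
(n_folds, total_folds) pairs, matching the unpacking above; all other values
are hypothetical.

# hypothetical invocation: 10 randomized-search iterations, scoring on
# 3 of 5 external folds, selecting on 3 internal folds
train(dataset='data/dataset.csv',
      output='results/{suffix}',
      external_folds=(3, 5), internal_folds=(3, 3),
      n_iter=10, n_jobs=-1, random_seed=42,
      param_file=None, model_file='model.pkl',
      cli_params=[], shuffle=True)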