# Imports assumed for this section; the project-internal helpers used below
# (extract_features_from_df, convert_dataset, generate_grouped_splits,
# search_params, get_param_grid, get_ordered_dataset, cv_train) are imported
# elsewhere in the package.
import json
import logging
import os
import pickle
import pprint
import subprocess

import dask
import dask.dataframe as dd
import dask.multiprocessing
import numpy as np
import pandas as pd
import pkg_resources
from scipy.stats.distributions import rv_frozen
from sklearn.base import clone
from sklearn.metrics import get_scorer

logger = logging.getLogger(__name__)


def dom(input_files, output_files, height, depth, num_workers):
    """Extract the DOM features and output them to a directory, in a
    partitioned fashion.

    INPUT_FILES can be a glob pattern matching either a set of CSVs
    containing "html,url" columns or the HTML files themselves, in which
    case each filename is used as the url ("file://filename").
    OUTPUT_FILES names the pattern of the CSV files where the features
    are written.
    """
    # set the number of workers
    dask.set_options(get=dask.multiprocessing.get, num_workers=num_workers)

    # must read with pandas because dask makes a fuss about html
    html_df = pd.read_csv(input_files)  # df of 'html'/'url'
    feats = extract_features_from_df(html_df, depth=depth, height=height,
                                     num_workers=num_workers)

    # output the features to csv
    logger.info('Outputting features')
    feats.to_csv(output_files, index=False)
    logger.info('DONE!')
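# Usage sketch (hypothetical file names and parameter values): the input CSV
# is expected to carry the two columns named in the docstring, e.g.
#
#   html,url
#   "<html>...</html>",http://example.com/page
#
# after which the features could be extracted with something like:
#
#   dom('pages.csv', 'features-*.csv', height=5, depth=5, num_workers=4)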
def init_datasets(destination, num_workers):
    """Download and convert the CleanEval and Dragnet datasets into DESTINATION."""
    if not os.path.exists(destination):
        logger.info('Path does not exist - creating')
        os.makedirs(destination)

    # get the script location
    script_path = pkg_resources.resource_filename(__name__, 'prepare_data.sh')

    logger.info('Beginning download')
    # run the download/conversion script
    subprocess.run(['bash', script_path, destination, str(num_workers)])
    logger.info('Done')
def merge(cache, output_files, input_files, on):
    """Merge the given files on the columns specified with the --on option
    and output the result to OUTPUT_FILES."""
    # set the cache directory if specified
    if cache is not None:
        logger.info('Using {} as cache'.format(cache))
        dask.set_options(temporary_directory=cache)

    on_columns = on.split(',')  # the columns to merge on
    result_ddf = dd.read_csv(input_files[0])  # start from the first file
    for in_files in input_files[1:]:
        # merge with the others, one at a time
        logger.info('Merging {}'.format(in_files))
        in_file_ddf = dd.read_csv(in_files)
        result_ddf = result_ddf.merge(in_file_ddf, on=on_columns)

    # output the merged dataframe
    logger.info('Outputting')
    result_ddf.to_csv(output_files, index=False)
    logger.info('Done')
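# Usage sketch (hypothetical file names and column): merging a features file
# with a labels file on a shared 'url' column, with a local dask cache:
#
#   merge(cache='/tmp/dask-cache',
#         output_files='merged-*.csv',
#         input_files=['features-*.csv', 'labels-*.csv'],
#         on='url')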
def convert(dataset_directory, output_directory, raw, labels, num_workers,
            cleaneval, blocks):
    """Convert the dataset from DATASET_DIRECTORY to our format and output
    it to OUTPUT_DIRECTORY."""
    html_ddf, label_ddf = convert_dataset(
        dataset_directory,
        'dragnet-' if not cleaneval else 'cleaneval-',
        cleaneval=cleaneval,
        return_extracted_blocks=blocks)

    # set the number of workers
    dask.set_options(get=dask.multiprocessing.get, num_workers=num_workers)

    if raw:
        # output the html
        logger.info('Outputting raw')
        html_ddf.compute().to_csv(output_directory + '/raw.csv', index=False)

    if labels:
        # output the labels
        logger.info('Outputting labels')
        label_ddf.compute().to_csv(output_directory + '/labels.csv', index=False)

    logger.info('Done!')
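# Usage sketch (hypothetical paths): converting a CleanEval dump and writing
# both the raw html and the labels:
#
#   convert('datasets/cleaneval', 'datasets/converted', raw=True,
#           labels=True, num_workers=4, cleaneval=True, blocks=False)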
def nested_cv(estimator, X, y, groups=None, param_distributions=None,
              n_iter=20, internal_n_folds=5, internal_total_folds=None,
              external_n_folds=5, external_total_folds=None, n_jobs=-1,
              scoring='f1'):
    """Perform nested CV with an internal randomized CV for model selection.

    Given a dataset with optional grouping and a parameter distribution for
    an estimator, perform nested CV. Model selection is done in the internal
    loop, which uses a sample of n folds out of a given total (by default
    all the folds are used, but with a sample of one it effectively becomes
    a single split). The score is then computed on each external fold, and
    the returned values are the list of scores and a dataframe containing
    all the internal CV results.
    """
    # convert groups and labels to plain numpy arrays
    if isinstance(y, pd.Series):
        y = y.values
    if isinstance(groups, pd.Series):
        groups = groups.values

    # select the defaults
    if groups is None:
        groups = np.arange(y.shape[0])
    if internal_total_folds is None:
        internal_total_folds = internal_n_folds
    if external_total_folds is None:
        external_total_folds = external_n_folds

    # get the external splits
    splits = generate_grouped_splits(X, y, groups,
                                     total_folds=external_total_folds,
                                     n_folds=external_n_folds)

    # list in which to store all the cv results
    all_cv_results = []

    # get the scorer for the metric
    scorer = get_scorer(scoring)
    scores = np.zeros(external_n_folds, dtype='float32')
    for run_nb, split in zip(range(external_n_folds), splits):
        logger.info('Model selection on fold number {}...'.format(run_nb))
        # split the dataset
        if isinstance(X, pd.DataFrame):
            X_train, X_test = X.iloc[split[0]], X.iloc[split[1]]
        else:
            X_train, X_test = X[split[0]], X[split[1]]
        y_train, y_test = y[split[0]], y[split[1]]
        groups_train, groups_test = groups[split[0]], groups[split[1]]

        # do the internal loop, passing the corresponding groups
        best_params, cv_results = search_params(
            estimator, X_train, y_train, groups=groups_train,
            param_distributions=param_distributions, n_iter=n_iter,
            n_folds=internal_n_folds, total_folds=internal_total_folds,
            n_jobs=n_jobs, scoring=scoring)

        # refit the best estimator with all the training data
        logger.info('Refitting estimator with best params...')
        best_est = clone(estimator)  # fresh, unfitted copy of the estimator
        best_est.set_params(**best_params)  # params are passed as kwargs
        best_est.fit(X_train, y_train)

        # add the score to the list of all scores
        scores[run_nb] = scorer(best_est, X_test, y_test)

        # log the result
        logger.info('SCORE FOR BEST ESTIMATOR ON FOLD NUMBER {} = {}'.format(
            run_nb, scores[run_nb]))

        # add the cross-validation dataframe to the list
        cv_results['run_nb'] = run_nb
        all_cv_results.append(cv_results)

    return scores, pd.concat(all_cv_results, ignore_index=True)
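# Usage sketch (assumed estimator and distributions, not part of this
# module): param_distributions values are expected to be either lists or
# frozen scipy distributions, e.g.
#
#   from scipy.stats import randint
#   from sklearn.ensemble import RandomForestClassifier
#
#   scores, cv_results = nested_cv(
#       RandomForestClassifier(),
#       X, y, groups,
#       param_distributions={'n_estimators': randint(50, 500),
#                            'max_depth': [None, 5, 10]},
#       n_iter=10, scoring='f1')
#
#   print(scores.mean(), scores.std())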
def train(dataset, output, external_folds, internal_folds, n_iter, n_jobs,
          random_seed, param_file, model_file, cli_params, shuffle):
    """Train a model over a dataset, given a set of parameter values to use
    for the CV, and evaluate the expected f1-score with nested CV."""
    params = {}
    # attempt to read the params from the file
    if param_file is not None:
        params = json.load(param_file)
    for param in cli_params:
        # load the values from json
        key, val = param
        loaded_val = json.loads(val)
        params[key] = loaded_val

    logger.debug('Passing params:\n{}'.format(pprint.pformat(params)))

    # extract the params
    blocks_only = params.pop('blocks_only', True)  # use only the blocks

    # load the dataset
    logger.info('Loading the dataset')
    X, y, groups = get_ordered_dataset(dataset, blocks_only=blocks_only,
                                       shuffle=shuffle)

    # unpack the fold numbers
    internal_n_folds, internal_total_folds = internal_folds
    external_n_folds, external_total_folds = external_folds

    # seed the random number generator
    logger.info('Seeding the random number generator')
    np.random.seed(random_seed)
    # there is no other solution than seeding tf inside the worker
    # tf.set_random_seed(random_seed)

    # load the estimator and the appropriate parameter grid
    estimator, param_distributions = get_param_grid(**params)
    logger.debug('Computed params (after default values):\n{}'.format(
        pprint.pformat(param_distributions)))

    # properly format the params: wrap single values in lists if necessary;
    # rv_frozen is the exception because it is a scipy distribution
    param_distributions = {
        key: val if isinstance(val, (list, rv_frozen)) else [val]
        for key, val in param_distributions.items()
    }

    # output the scores only if specified
    if output is not None:
        # nested CV for the score estimate
        logger.info('Performing nested CV')
        scores, cv = nested_cv(estimator, X, y, groups,
                               param_distributions=param_distributions,
                               n_iter=n_iter,
                               internal_n_folds=internal_n_folds,
                               internal_total_folds=internal_total_folds,
                               external_n_folds=external_n_folds,
                               external_total_folds=external_total_folds,
                               n_jobs=n_jobs)

        # output the results
        logger.info('Saving the results')
        output_scores = output.format(suffix='scores.csv')
        output_cv = output.format(suffix='cv.csv')
        np.savetxt(output_scores, scores)
        cv.to_csv(output_cv, index=False)

    # train the model on the whole dataset only if model_file is specified
    if model_file is not None:
        logger.info('Training the model over the entire dataset')
        trained_est = cv_train(estimator, X, y, groups,
                               param_distributions=param_distributions,
                               n_iter=n_iter, n_folds=external_n_folds,
                               total_folds=external_total_folds,
                               n_jobs=n_jobs)

        # save the estimator as a pickle
        logger.info('Saving the model')
        with open(model_file, 'wb') as f:
            pickle.dump(trained_est, f)

    logger.info('DONE')
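# Param file sketch (hypothetical keys, except blocks_only, which is read
# above): the JSON loaded from param_file is merged with the CLI params and
# forwarded to get_param_grid, so it could look like:
#
#   {
#       "blocks_only": true,
#       "n_estimators": [100, 200]
#   }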