def job_scripts_features(data_folder, imgs_folder, args):
    """Generate the job scripts required for feature extraction.

    Every combination of dataset, descriptor and single radius value is
    inspected. A job script is requested through `job_script` only for
    those combinations whose features file is not found on disk.

    Parameters
    ----------
    data_folder : string
        Full path of the folder where data are saved.
    imgs_folder : string
        Full path of the folder where texture datasets are stored.
    args : argparse.Namespace
        Command line arguments.
    """
    print('Generating scripts for feature extraction...\n')
    n_jobs = 0
    for dataset in gen_datasets(imgs_folder, args.dataset):
        dataset_id = dataset.acronym
        for descriptor in gen_descriptors(args):
            for radius in descriptor.radius:
                # Work on a copy restricted to one radius so that the
                # original (possibly multi-scale) descriptor is untouched.
                single = copy.deepcopy(descriptor)
                single.radius = [radius]
                single_id = single.abbrev()
                target = utils.filepath(data_folder, dataset_id, single_id)
                if os.path.isfile(target):
                    # Features already available: no job needed.
                    continue
                n_jobs += 1
                job_script(dataset, single, args, n_jobs, action='ef')
def read_score(folder, dat_id, descr_id, clf_id):
    """Load the stored test scores and return their mean as a percentage.

    Parameters
    ----------
    folder : string
        Full path of the folder where data are saved.
    dat_id : string
        Short name of a dataset.
    descr_id : string
        Short name of a descriptor.
    clf_id : string
        Short name of a classifier.

    Returns
    -------
    ts_avg : float or None
        Mean of the test scores multiplied by 100, or `None` when the
        results file does not exist.
    """
    score_file = utils.filepath(folder, dat_id, descr_id, clf_id)
    if not os.path.isfile(score_file):
        return None
    outcomes = utils.load_object(score_file)
    # Each stored item is a pair; only its second element (the test
    # score) is needed here.
    scores = [score for _, score in outcomes]
    return 100 * np.mean(scores)
def delete_one_file(path_args):
    """Remove one file identified by its path components.

    Parameters
    ----------
    path_args : sequence of str
        Components that make up the full path of the file to be
        deleted. They are forwarded, unpacked, to `utils.filepath`.
    """
    utils.attempt_to_delete_file(utils.filepath(*path_args))
def generate_epub(book_path, base_filename, tex_file):
    """Convert a LaTeX file to EPUB with pandoc and show the result.

    Parameters
    ----------
    book_path : str
        Folder of the book; passed to `filepath` to build the output name.
    base_filename : str
        Base name of the output file; passed to `filepath`.
    tex_file : str
        Path of the LaTeX source handed to pandoc.

    Raises
    ------
    ValueError
        If pandoc finishes with a non-zero return code.
    """
    epub_file = filepath(book_path, base_filename, 'epub')
    cmd = ['pandoc', '--from=latex', '-o', epub_file, tex_file]
    proc = subprocess.Popen(cmd)
    proc.communicate()
    retcode = proc.returncode
    if retcode != 0:
        # pandoc may fail before writing anything. Only remove the output
        # if it exists, otherwise FileNotFoundError would hide the real
        # error reported below.
        if os.path.isfile(epub_file):
            os.unlink(epub_file)
        raise ValueError('Error {} executing command: {}'.format(retcode, ' '.join(cmd)))
    show_file(epub_file)
def generate_pdf(book_path, base_filename, tex_file):
    """Compile a LaTeX file to PDF with pdflatex and show the result.

    pdflatex is executed twice with the same command line; only the return
    code of the second run is checked.
    NOTE(review): the double run is presumably needed so that the table of
    contents and cross-references are resolved -- confirm.

    Parameters
    ----------
    book_path : str
        Folder of the book; used as pdflatex output directory.
    base_filename : str
        Base name of the generated files; passed to `filepath`.
    tex_file : str
        Path of the LaTeX source to compile.

    Returns
    -------
    pdf_file : str
        Path of the generated PDF, as built by `filepath`.

    Raises
    ------
    ValueError
        If the second pdflatex run finishes with a non-zero return code.
    """
    pdf_file = filepath(book_path, base_filename, 'pdf')
    cmd = ['pdflatex', '-interaction', 'nonstopmode',
           '-output-directory', book_path, tex_file]
    # First pass: its return code is deliberately not inspected.
    proc = subprocess.Popen(cmd)
    proc.communicate()
    # Second pass: this is the run whose outcome decides success.
    proc = subprocess.Popen(cmd)
    proc.communicate()
    retcode = proc.returncode
    if retcode != 0:
        # The PDF may not exist if compilation failed early; an unguarded
        # unlink would raise FileNotFoundError and mask the error below.
        if os.path.isfile(pdf_file):
            os.unlink(pdf_file)
        raise ValueError('Error {} executing command: {}'.format(retcode, ' '.join(cmd)))
    # Remove auxiliary files, tolerating any that were not produced.
    for extension in ('toc', 'log', 'aux'):
        leftover = filepath(book_path, base_filename, extension)
        if os.path.isfile(leftover):
            os.unlink(leftover)
    show_file(pdf_file)
    return pdf_file
def get_features(folder, dataset, descriptor):
    """Gather the features of one dataset computed with one descriptor.

    The descriptor is processed one radius at a time. For each radius the
    features are loaded from disk when available; otherwise they are
    computed and saved. The per-radius feature arrays are finally joined
    along the last axis.

    Parameters
    ----------
    folder : string
        Full path of the folder where data are saved.
    dataset : texdata.TextureDataset
        Object that encapsulates data of a texture dataset.
    descriptor : hep.HEP
        Object that encapsulates data of a texture descriptor.

    Returns
    -------
    X : array
        Concatenation (along the last axis) of the features obtained for
        each radius. `None` is returned as soon as the computation for
        one radius yields `None`.
    """
    dataset_id = dataset.acronym
    per_radius = []
    for radius in descriptor.radius:
        # Single-scale copy of the descriptor for the current radius.
        single = copy.deepcopy(descriptor)
        single.radius = [radius]
        descr_single_id = single.abbrev()
        cache_file = utils.filepath(folder, dataset_id, descr_single_id)

        if os.path.isfile(cache_file):
            per_radius.append(utils.load_object(cache_file))
            continue

        print(f'Computing {dataset_id}--{descr_single_id}')
        if hasattr(single, 'components'):
            feats = concatenate_feats(folder, dataset, single)
        else:
            feats = apply_descriptor(dataset, single)
        if feats is None:
            # Computation failed: give up on the whole descriptor.
            return None
        utils.save_object(feats, cache_file)
        per_radius.append(feats)

    return np.concatenate(per_radius, axis=-1)
def concatenate_feats(data_folder, dataset, descriptor):
    """Build features by joining the outputs of several texture models.

    Each entry of `descriptor.components` is instantiated with the
    attributes of `descriptor` as keyword arguments. Its features are
    loaded from disk when available, otherwise computed with
    `apply_descriptor` and saved.

    Parameters
    ----------
    data_folder : str
        Full path of the folder where data are saved.
    dataset : texdata.TextureDataset
        Object that encapsulates data of a texture dataset.
    descriptor : hep.HEP
        Object that encapsulates data of a texture descriptor.

    Returns
    -------
    X : array
        Features of all the components concatenated along the last axis.
        `None` is returned as soon as `apply_descriptor` yields `None`
        for one of the components.
    """
    dat_id = dataset.acronym
    # Shallow copy of the descriptor attributes, used as constructor
    # arguments of every component.
    params = dict(descriptor.__dict__)
    blocks = []
    for component in descriptor.components:
        model = component(**params)
        model_id = model.abbrev()
        cache_file = utils.filepath(data_folder, dat_id, model_id)

        if os.path.isfile(cache_file):
            blocks.append(utils.load_object(cache_file))
            continue

        computed = apply_descriptor(dataset, model)
        if computed is None:
            return None
        utils.save_object(computed, cache_file)
        blocks.append(computed)

    return np.concatenate(blocks, axis=-1)
def extract_features(data_folder, imgs_folder, args):
    """Extract and store the texture features that are still missing.

    For each dataset, descriptor and single radius value, the features
    file is looked up on disk. When it is absent the features are
    computed and, unless the computation returned `None`, saved. A
    multi-scale descriptor therefore produces one file per radius.

    Parameters
    ----------
    data_folder : string
        Full path of the folder where data are saved.
    imgs_folder : string
        Full path of the folder where texture datasets are stored.
    args : argparse.Namespace
        Command line arguments.
    """
    utils.boxed_text('Extracting features...', symbol='*')
    for dataset in gen_datasets(imgs_folder, args.dataset):
        dat_id = dataset.acronym
        for descriptor in gen_descriptors(args):
            for radius in descriptor.radius:
                single = copy.deepcopy(descriptor)
                single.radius = [radius]
                descr_rad_id = single.abbrev()
                target = utils.filepath(data_folder, dat_id, descr_rad_id)

                if os.path.isfile(target):
                    print(f'Found {dat_id}--{descr_rad_id}', flush=True)
                    continue

                print(f'Computing {dat_id}--{descr_rad_id}', flush=True)
                if hasattr(single, 'components'):
                    feats = concatenate_feats(data_folder, dataset, single)
                else:
                    feats = apply_descriptor(dataset, single)
                if feats is not None:
                    utils.save_object(feats, target)
                # Drop the reference before the next iteration.
                del feats
def job_scripts_results(data_folder, imgs_folder, args, estimators):
    """Generate the job scripts required to compute classification results.

    For every classifier, dataset and descriptor, the results file is
    looked up on disk and a job script is requested through `job_script`
    only when that file is missing.

    Parameters
    ----------
    data_folder : string
        Full path of the folder where data are saved.
    imgs_folder : string
        Full path of the folder where texture datasets are stored.
    args : argparse.Namespace
        Command line arguments.
    estimators : list of tuples
        Each tuple consist in a classifier (such as nearest neighbour,
        support vector machine, etc.) and the parameters used for
        optimization through `GridSearch`. Only the classifier is used
        here.
    """
    print('Generating job scripts for classification...\n')
    n_jobs = 0
    for clf, _ in estimators:
        # Short classifier name: the upper-case letters of the class name.
        clf_id = ''.join(ch for ch in clf.__name__ if ch.isupper())
        for dataset in gen_datasets(imgs_folder, args.dataset):
            dataset_id = dataset.acronym
            for descriptor in gen_descriptors(args):
                descriptor_id = descriptor.abbrev()
                target = utils.filepath(
                    data_folder, dataset_id, descriptor_id, clf_id)
                if os.path.isfile(target):
                    continue
                n_jobs += 1
                job_script(dataset, descriptor, args, n_jobs, action='c')
else: text_files = [f for f in glob.glob(os.path.join(book_path, '*.txt')) if not f.endswith('words.txt')] if text_files: VARS['CONTENT'] = latex_single(text_files[0], split_paragraphs, VARS['sections'], VARS['new_page_before_sections']) sep_path = os.path.join(book_path, 'words.txt') if os.path.isfile(sep_path): with open(sep_path, 'r') as f: hyphenation = '' for word in f.readlines(): hyphenation += latex_hyphenation(word.strip()) VARS['HYPHENATION'] = hyphenation TEMPLATE = 'template.tex' template = latex_env.get_template(TEMPLATE) base_filename = VARS['BASE_FILENAME'] tex_file = filepath(book_path, base_filename, 'tex') with open(tex_file, 'w') as f: f.write(template.render(**VARS)) if not args.only_tex: if args.pdf or not args.epub: pdf_file = generate_pdf(book_path, base_filename, tex_file) if args.booklet: generate_booklet(pdf_file, filepath(book_path, base_filename, 'booklet.pdf')) if args.epub: generate_epub(book_path, base_filename, tex_file)
def classify(data_folder, imgs_folder, args, estimators, test_size, n_tests,
             n_folds, random_state):
    """Run (or reload) the classification experiments and report them.

    For each estimator, dataset and descriptor the results file is looked
    up on disk. If present it is loaded; otherwise the features are
    obtained with `get_features`, `n_tests` grid searches are run and the
    outcome is saved. In both cases the mean scores are printed.

    Parameters
    ----------
    data_folder : string
        Full path of the folder where data are saved.
    imgs_folder : string
        Full path of the folder where texture datasets are stored.
    args : argparse.Namespace
        Command line arguments.
    estimators : list of tuples
        Each tuple consist in a classifier (such as nearest neighbour,
        support vector machine, etc.) and the parameters used for
        optimization through `GridSearch`.
    test_size : float
        Proportion (between 0.0 and 1.0) of the dataset to include in
        the test split.
    n_tests : int
        Number of reshuffling and splitting operations.
    n_folds : int
        Number of folds used for cross-validation. Must be at least 2.
    random_state : int
        Seed for the random number generator. This affects the splits
        into train and test, and the cross-validation folds.
    """
    utils.boxed_text('Classifying...', symbol='*')
    for clf, param_grid in estimators:
        clf_id = ''.join(ch for ch in clf.__name__ if ch.isupper())
        for dat in gen_datasets(imgs_folder, args.dataset):
            dat_id = dat.acronym
            for descr in gen_descriptors(args):
                descr_id = descr.abbrev()
                result_path = utils.filepath(
                    data_folder, dat_id, descr_id, clf_id)

                if not os.path.isfile(result_path):
                    X = get_features(data_folder, dat, descr)
                    if X is None:
                        print(f'Skipping {dat_id}--{descr_id}--{clf_id}',
                              flush=True)
                        continue
                    print(f'Computing {dat_id}--{descr_id}--{clf_id}',
                          flush=True)
                    y = dat.labels
                    np.random.seed(random_state)
                    seeds = np.random.randint(size=n_tests, low=0, high=1000)
                    # A different seed must be handed to `grid_search_cv`
                    # on each repetition. With a single seed the train/test
                    # split would always be the same and every repetition
                    # would return identical results.
                    result = []
                    for rs in seeds:
                        result.append(
                            grid_search_cv(X, y, clf, param_grid, n_folds,
                                           test_size, rs))
                    utils.save_object(result, result_path)
                else:
                    print(f'Loading {dat_id}--{descr_id}--{clf_id}',
                          flush=True)
                    result = utils.load_object(result_path)

                # Each item of `result` is a (grid search, test score) pair.
                best_scores = [g.best_score_ for g, _ in result]
                test_scores = [ts for _, ts in result]
                print(f'Mean best cv score: {100*np.mean(best_scores):.2f}%')
                print(f'Mean test score: {100*np.mean(test_scores):.2f}%\n')
def main():
    """Command line entry point: build the LaTeX, PDF and/or EPUB of a book.

    Steps performed:

    1. Parse the command line.
    2. Merge settings: `DEFAULTS`, then `CONFIGS` from an optional
       `config.py` inside the book folder, then command line values for
       every key that is still missing or falsy.
    3. Build the book content from the files listed in `index.txt`, or
       from the first `*.txt` file found when there is no index.
    4. Collect hyphenation rules from `words.txt` when present.
    5. Render the template (a `template.tex` in the book folder takes
       precedence over the default one) into `<BASE_FILENAME>.tex`.
    6. Unless `--only-tex` is given, generate the PDF (default, or with
       `--pdf`), optionally its booklet, and the EPUB (with `--epub`).

    The process terminates with exit status 1 when the positional
    argument is not a directory.
    """
    import sys  # local import: only needed for the early-exit path

    parser = argparse.ArgumentParser()
    parser.add_argument('--BASE_FILENAME', default='default')
    parser.add_argument('book_path',
                        help='Carpeta con archivos para un libro.',
                        metavar='carpeta')
    parser.add_argument('--no-split', help='No separar párrafos.',
                        action='store_true')
    parser.add_argument('--pdf', help='Genera la versión pdf del libro.',
                        action='store_true')
    parser.add_argument('--booklet',
                        help='Genera la versión booklet del pdf.',
                        action='store_true')
    parser.add_argument('--epub', help='Genera la versión epub del libro.',
                        action='store_true')
    parser.add_argument('--only-tex', help='Solo genera el archivo latex.',
                        action='store_true')
    parser.add_argument(
        '--sections',
        help='Usar secciones en lugar de capítulos como elemento principal.',
        action='store_true')
    parser.add_argument(
        '--new-page-before-sections',
        help='Forzar página nueva en las secciones principales.',
        action='store_true')
    parser.add_argument('--TITLE', default='TITLE')
    parser.add_argument('--SUBTITLE', default='')
    parser.add_argument('--AUTHOR', default='AUTHOR')
    parser.add_argument('--FONT_SIZE', default=11)
    parser.add_argument('--PAGE_SIZE', default='a5paper')
    parser.add_argument('--YEAR', default=datetime.now().year)
    parser.add_argument('--URL', default='')
    parser.add_argument('--exclude-index', action='store_true')
    args = parser.parse_args()
    book_path = args.book_path

    class EmptyConfig(object):
        """Stand-in for the config module when the book has no config.py."""
        pass

    if not os.path.isdir(book_path):
        print('El argumento debe ser un directorio')
        # `sys.exit` does not depend on the `site` module (unlike the
        # interactive `exit` helper) and a non-zero status reports the
        # failure to the calling shell.
        sys.exit(1)

    config_file = os.path.join(book_path, 'config.py')
    if os.path.isfile(config_file):
        # NOTE(review): `imp` is deprecated and was removed in Python
        # 3.12; migrate to `importlib` once older interpreters need not
        # be supported.
        config = imp.load_source('config', config_file)
    else:
        config = EmptyConfig()
        config.CONFIGS = {}

    VARS = DEFAULTS.copy()
    VARS.update(config.CONFIGS)
    # Command line values only fill keys that are missing or falsy.
    for k, v in args._get_kwargs():
        if not VARS.get(k):
            VARS[k] = v

    index_path = os.path.join(book_path, 'index.txt')
    split_paragraphs = not VARS['no_split']
    if os.path.isfile(index_path):
        pieces = []
        with open(index_path, 'r') as f:
            for filename in f:
                if not filename.strip():
                    # A blank line would resolve to the book folder
                    # itself, which is not a readable text file.
                    continue
                source = os.path.join(book_path, filename).strip()
                if VARS['no_split']:
                    pieces.append(latex_chapter(source, split_paragraphs))
                else:
                    pieces.append(
                        latex_single(source, split_paragraphs,
                                     VARS['sections'],
                                     VARS['new_page_before_sections']))
        VARS['CONTENT'] = ''.join(pieces)
    else:
        text_files = [
            f for f in glob.glob(os.path.join(book_path, '*.txt'))
            if not f.endswith('words.txt')
        ]
        if text_files:
            VARS['CONTENT'] = latex_single(text_files[0], split_paragraphs,
                                           VARS['sections'],
                                           VARS['new_page_before_sections'])

    sep_path = os.path.join(book_path, 'words.txt')
    if os.path.isfile(sep_path):
        with open(sep_path, 'r') as f:
            VARS['HYPHENATION'] = ''.join(
                latex_hyphenation(word.strip()) for word in f)

    TEMPLATE = 'template.tex'
    local_template_path = os.path.join(book_path, 'template.tex')
    if os.path.isfile(local_template_path):
        # Read through a context manager so the handle is always closed.
        with open(local_template_path) as template_file:
            template = latex_env.from_string(template_file.read())
    else:
        template = latex_env.get_template(TEMPLATE)

    base_filename = VARS['BASE_FILENAME']
    tex_file = filepath(book_path, base_filename, 'tex')
    with open(tex_file, 'w') as f:
        f.write(template.render(**VARS))

    if not args.only_tex:
        # PDF is the default output when no format is requested.
        if args.pdf or not args.epub:
            pdf_file = generate_pdf(book_path, base_filename, tex_file)
            if args.booklet:
                generate_booklet(
                    pdf_file,
                    filepath(book_path, base_filename, 'booklet.pdf'))
        if args.epub:
            generate_epub(book_path, base_filename, tex_file)
def generate_latex(args):
    """Write a LaTeX report of the classification results.

    The settings are loaded with `load_settings`, the datasets,
    descriptors and classifiers are displayed, and the LaTeX code is
    assembled: introductory sections first, then one section per
    descriptor class containing one `longtable` per classifier. Each
    table row corresponds to a dataset/radius pair and each numeric
    column to one of the orders in `hep._orders`. The best value of each
    row is typeset in bold. The code is written to `results.tex` inside
    `config.DATA`.

    Parameters
    ----------
    args : argparse.Namespace
        Command line arguments, forwarded to `load_settings`.
    """
    def get_max_val(lst):
        """Return the largest number in `lst`, or None if it has none.

        Items that are lists contribute all their elements; items that
        are neither numbers nor lists (e.g. `None`) are ignored.
        """
        flattened = []
        for item in lst:
            if isinstance(item, (int, float)):
                flattened.append(item)
            elif isinstance(item, list):
                flattened.extend(item)
        return max(flattened) if flattened else None

    def mean_test_score(db, descr, clf, tag):
        """Return the mean test score (as a percentage) or None.

        `tag` only labels the message printed when the results file is
        missing.
        """
        result_path = utils.filepath(config.DATA, db, descr, clf)
        if not os.path.isfile(result_path):
            print('Not found ({0}): '.format(tag), result_path)
            return None
        result = utils.load_object(result_path)
        return 100*np.mean([ts for g, ts in result])

    # Load settings
    dbtex, imdescr = load_settings(args, config.IMGS)

    # Display information
    utils.display_sequence(dbtex, 'Datasets', symbol='-')
    utils.display_sequence(imdescr, 'Descriptors', symbol='-')
    utils.display_sequence(
        [est[0].__name__ for est in config.ESTIMATORS],
        'Classifiers', symbol='-')
    utils.display_message('Generating LaTeX code', symbol='*')

    # Generate introductory sections
    code = introduction_validation(
        config.DATA, config.N_TESTS, config.TEST_SIZE, config.N_FOLDS)
    code += introduction_classifiers(config.ESTIMATORS)
    code += introduction_dimensions(imdescr)
    code += introduction_parameters()
    code = [code]

    # Names of the used descriptors (sorted alphabetically)
    sections = sorted(set(d.__class__.__name__ for d in imdescr))

    # TODO: the nested loops below still need refactoring.
    for s in sections:
        # `s`: section title (is a descriptor name)
        code.append(r'\section*{{{0}}}'.format(s))
        tups = [tuple(d.radius) for d in imdescr
                if d.__class__.__name__ == s]
        # `rlst`: radii considered for descriptor `s`, shortest first
        rlst = [*map(list, sorted(set(tups), key=lambda x: (len(x), x)))]
        # `osect`: orders shown as columns. All the available orders are
        # listed, not only those actually used by descriptor `s`.
        osect = hep._orders
        for k, (clf, _) in enumerate(config.ESTIMATORS):
            if k > 0:
                code.append(r'\newpage')
            code.append(r'\subsection*{{{0}}}'.format(clf.__name__))
            code.append(
                r'\begin{{longtable}}{{ll{0}}}'.format('r'*len(osect)))
            heading = r' & '.join([o.capitalize() for o in osect])
            code.append(r'Dataset & Radius & {0} \\'.format(heading))
            code.append(r'\hline')
            for db in dbtex:
                for i, r in enumerate(rlst):
                    # The dataset name is printed only on its first row.
                    if i == 0:
                        line = r'{0} & {1} '.format(db, r)
                    else:
                        line = r' & {0} '.format(r)

                    vals = []
                    for o in osect:
                        # `same`: descriptors with the same name, radius
                        # and order
                        same = [d for d in imdescr
                                if d.__class__.__name__ == s
                                and d.radius == r and d.order == o]
                        if not same:
                            vals.append(None)
                        elif len(same) == 1:
                            vals.append(
                                mean_test_score(db, same[0], clf, 'single'))
                        else:
                            accs = []
                            for descr in same:
                                acc = mean_test_score(db, descr, clf, 'multi')
                                if acc is not None:
                                    accs.append(acc)
                            if not accs:
                                vals.append(None)
                            elif len(accs) == 1:
                                vals.append(accs[0])
                            else:
                                # Several results: report their range.
                                vals.append([np.min(accs), np.max(accs)])

                    # The row maximum does not change while formatting the
                    # cells, so it is computed once per row.
                    maxval = get_max_val(vals)
                    for v in vals:
                        if v is None:
                            line += r'& '
                        elif isinstance(v, (int, float)):
                            if v == maxval:
                                line += r'& \bfseries{{{0:.1f}}} '.format(v)
                            else:
                                line += r'& {0:.1f} '.format(v)
                        elif isinstance(v, list):
                            if v[1] == maxval:
                                line += r'& \bfseries{{{0:.1f}--{1:.1f}}} '.format(v[0], v[1])
                            else:
                                line += r'& {0:.1f}--{1:.1f} '.format(v[0], v[1])
                    line += r'\\'
                    code.append(line)
            code.append(r'\end{longtable}')
        code.append(r'\newpage')
    code.append(r'\end{document}')

    latex_path = os.path.join(config.DATA, 'results.tex')
    with open(latex_path, 'w') as fid:
        fid.write('\n'.join(code))
# ensure_dir_exists(destination) # download_KylbergSintorn(destination, x=4, y=4) def job_script(folder, job_id, partition, datasets, descriptors, estimators=None)#data, loops): """ !!! """ for dat, descr in itertools.product(datasets, descriptors): dat_id = dat.acronym for rad in descr.radius: descr_single = copy.deepcopy(descr) descr_single.radius = [rad] descr_single_id = descr_single.abbrev() feat_args = [folder, dat_id, descr_single_id] if estimators is None: this_one = utils.filepath(*feat_args) else: for clf, _ in estimators: res_args = feat_args + [clf.__name__] this_one = utils.filepath(*res_args) datasets, descriptors, estimators = loops utils.display_sequence(datasets, 'Datasets', symbol='-') utils.display_sequence(descriptors, 'Descriptors', symbol='-') utils.display_sequence( [est[0].__name__ for est in estimators], 'Classifiers', symbol='-') utils.display_message('Creating arguments file and job script', symbol='*')