Example #1
def job_scripts_features(data_folder, imgs_folder, args):
    """"Create job scripts for feature extraction.
    
    Check whether features have already been computed. If they haven't,
    create a job script for each dataset-descriptor pair.

    Parameters
    ----------
    data_folder : string
        Full path of the folder where data are saved.
    imgs_folder : string
        Full path of the folder where texture datasets are stored.
    args : argparse.Namespace
        Command line arguments.
        
    """
    print('Generating scripts for feature extraction...\n')

    count = 0

    for dat in gen_datasets(imgs_folder, args.dataset):
        dat_id = dat.acronym
        for descr in gen_descriptors(args):
            for rad in descr.radius:
                descr_rad = copy.deepcopy(descr)
                descr_rad.radius = [rad]
                descr_rad_id = descr_rad.abbrev()
                
                feats_path = utils.filepath(data_folder, dat_id, descr_rad_id)    
                if not os.path.isfile(feats_path):
                    count += 1
                    job_script(dat, descr_rad, args, count, action='ef')
Example #2
def read_score(folder, dat_id, descr_id, clf_id):
    """Read test scores from a file and compute the average value

    Parameters
    ----------
    folder : string
        Full path of the folder where data are saved.
    dat_id : string
        Short name of a dataset.
    descr_id : string
        Short name of a descriptor.
    clf_id : string
        Short name of a classifier.
        
    Returns
    -------
    ts_avg : float
        Average of test scores.

    """
    result_path = utils.filepath(folder, dat_id, descr_id, clf_id)
    if os.path.isfile(result_path):
        result = utils.load_object(result_path)
        test_scores = [ts for _, ts in result]
        ts_avg = 100 * np.mean(test_scores)
        return ts_avg
    else:
        return None
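A hypothetical call (the folder and identifiers below are placeholders), showing how the None return value can be handled:

score = read_score('data', 'KTH', 'LBP', 'SVC')
if score is None:
    print('No results found for this combination')
else:
    print(f'Average test score: {score:.2f}%')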
Example #3
def delete_one_file(path_args):
    """Delete a single file
    
    Parameters
    ----------
    path_args : sequence of str
        Components that make up the full path of the file to be deleted.
    
    """
    fname = utils.filepath(*path_args)
    utils.attempt_to_delete_file(fname)
Example #4
def generate_epub(book_path, base_filename, tex_file):
    epub_file = filepath(book_path, base_filename, 'epub')
    cmd = ['pandoc', '--from=latex', '-o', epub_file, tex_file]

    proc = subprocess.Popen(cmd)
    proc.communicate()

    retcode = proc.returncode

    if retcode != 0:
        os.unlink(epub_file)
        raise ValueError('Error {} executing command: {}'.format(retcode, ' '.join(cmd)))
    else:
        show_file(epub_file)
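None of the examples above include filepath itself. Judging from the call filepath(book_path, base_filename, 'epub'), it appears to join a folder, a base name and an extension into a single path (the utils.filepath used in the texture examples takes a variable number of identifiers and is presumably defined differently). A minimal sketch under that assumption:

import os

def filepath(folder, base_filename, extension):
    # Hypothetical sketch; the real helper is not shown in these examples.
    return os.path.join(folder, '{}.{}'.format(base_filename, extension))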
Example #5
def generate_pdf(book_path, base_filename, tex_file):
    pdf_file = filepath(book_path, base_filename, 'pdf')
    cmd = ['pdflatex', '-interaction', 'nonstopmode', '-output-directory', book_path, tex_file]

    proc = subprocess.Popen(cmd)
    proc.communicate()

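    # pdflatex is run twice so that the table of contents and cross-references
    # written during the first pass are resolved in the second.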
    proc = subprocess.Popen(cmd)
    proc.communicate()

    retcode = proc.returncode

    if retcode != 0:
        os.unlink(pdf_file)
        raise ValueError('Error {} executing command: {}'.format(retcode, ' '.join(cmd)))
    else:
        if os.path.isfile(filepath(book_path, base_filename, 'toc')):
            os.unlink(filepath(book_path, base_filename, 'toc'))
        os.unlink(filepath(book_path, base_filename, 'log'))
        os.unlink(filepath(book_path, base_filename, 'aux'))
        show_file(pdf_file)

    return pdf_file
Example #6
def get_features(folder, dataset, descriptor):
    """Return texture features for a single dataset and descriptor.

    Parameters
    ----------
    folder : string
        Full path of the folder where data are saved.
    dataset : texdata.TextureDataset
        Object that encapsulates data of a texture dataset.
    descriptor : hep.HEP
        Object that encapsulates data of a texture descriptor.

    Returns
    -------
    X : array
        Texture features. The number of rows is equal to the number of
        samples and the number of columns is equal to the dimensionality
        of the feature space. If an error occurs within the call to 
        `apply_descriptor`, returns None.
        
    """
    multiscale_features = []
    dataset_id = dataset.acronym
    for rad in descriptor.radius:
        descr_single = copy.deepcopy(descriptor)
        descr_single.radius = [rad]
        descr_single_id = descr_single.abbrev()
        feat_path = utils.filepath(folder, dataset_id, descr_single_id)
        if os.path.isfile(feat_path):
            X = utils.load_object(feat_path)
        else:
            print(f'Computing {dataset_id}--{descr_single_id}')

            if hasattr(descr_single, 'components'):
                X = concatenate_feats(folder, dataset, descr_single)
            else:
                X = apply_descriptor(dataset, descr_single)
            if X is not None:
                utils.save_object(X, feat_path)
            else:
                break
        multiscale_features.append(X)
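    # for/else: the else clause below runs only if the loop completed without
    # break, i.e. features were available (or computed) for every radius.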
    else:
        X = np.concatenate(multiscale_features, axis=-1)
    return X
Example #7
def concatenate_feats(data_folder, dataset, descriptor):
    """Compute features through concatenation of texture models.

    Parameters
    ----------
    data_folder : str
        Full path of the folder where data are saved.
    dataset : texdata.TextureDataset
        Object that encapsulates data of a texture dataset.
    descriptor : hep.HEP
        Object that encapsulates data of a texture descriptor.
    
    Returns
    -------
    X : array
        Computed features. The number of rows is equal to the number of
        samples and the number of columns is equal to the sum of the 
        dimensionalities of the concatenated texture models. If an error 
        occurs in the call to `apply_descriptor`, it returns `None`.

    """
    dat_id = dataset.acronym
    params = {k: v for k, v in descriptor.__dict__.items()}
    feats = []

    for component in descriptor.components: 
        descr = component(**params)
        descr_id = descr.abbrev()
        feat_path = utils.filepath(data_folder, dat_id, descr_id)
        if os.path.isfile(feat_path):
            X = utils.load_object(feat_path)
        else:
            X = apply_descriptor(dataset, descr)
            if X is not None:
                utils.save_object(X, feat_path)
            else:
                break
        feats.append(X)
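    # As in get_features, the else clause runs only if no component failed
    # (the loop was never interrupted by break).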
    else:
        X = np.concatenate(feats, axis=-1)

    return X
Example #8
def extract_features(data_folder, imgs_folder, args):
    """"Compute texture features.
    
    Check whether features have already been computed. If they haven't,
    extract features from each dataset using each descriptor in
    `args` and save them to disk. If the descriptor is multi-scale,
    a separate file is created for each individual value of the radius.

    Parameters
    ----------
    data_folder : string
        Full path of the folder where data are saved.
    imgs_folder : string
        Full path of the folder where texture datasets are stored.
    args : argparse.Namespace
        Command line arguments.
        
    """
    utils.boxed_text('Extracting features...', symbol='*')

    for dat in gen_datasets(imgs_folder, args.dataset):
        dat_id = dat.acronym
        for descr in gen_descriptors(args):
            for rad in descr.radius:
                descr_rad = copy.deepcopy(descr)
                descr_rad.radius = [rad]
                descr_rad_id = descr_rad.abbrev()
                feat_path = utils.filepath(data_folder, dat_id, descr_rad_id)
                if os.path.isfile(feat_path):
                    print(f'Found {dat_id}--{descr_rad_id}', flush=True)
                else:
                    print(f'Computing {dat_id}--{descr_rad_id}', flush=True)
                    if hasattr(descr_rad, 'components'):
                        X = concatenate_feats(data_folder, dat, descr_rad)
                    else:
                        X = apply_descriptor(dat, descr_rad)
                    if X is not None:
                        utils.save_object(X, feat_path)
                        del X
Example #9
def job_scripts_results(data_folder, imgs_folder, args, estimators):
    """"Create job scripts to compute classification results.
    
    Check whether results have already been computed. If they haven't,
    create a job script for each dataset-descriptor pair.

    Parameters
    ----------
    data_folder : string
        Full path of the folder where data are saved.
    imgs_folder : string
        Full path of the folder where texture datasets are stored.
    args : argparse.Namespace
        Command line arguments.
    estimators : list of tuples
        Each tuple consists of a classifier (such as nearest neighbour,
        support vector machine, etc.) and the parameters used for
        optimization through `GridSearch`.
        
    """
    print('Generating job scripts for classification...\n')

    count = 0

    for clf, param_grid in estimators:
        clf_id = ''.join(
                [letter for letter in clf.__name__ if letter.isupper()])
        for dat in gen_datasets(imgs_folder, args.dataset):
            dat_id = dat.acronym
            for descr in gen_descriptors(args):
                descr_id = descr.abbrev()                
                result_path = utils.filepath(
                        data_folder, dat_id, descr_id, clf_id)
                if not os.path.isfile(result_path):
                    count += 1
                    job_script(dat, descr, args, count, action='c')
Example #10
else:
    text_files = [f for f in glob.glob(os.path.join(book_path, '*.txt')) if not f.endswith('words.txt')]
    if text_files:
        VARS['CONTENT'] = latex_single(text_files[0], split_paragraphs, VARS['sections'], VARS['new_page_before_sections'])

sep_path = os.path.join(book_path, 'words.txt')
if os.path.isfile(sep_path):
    with open(sep_path, 'r') as f:
        hyphenation = ''
        for word in f.readlines():
            hyphenation += latex_hyphenation(word.strip())
        VARS['HYPHENATION'] = hyphenation

TEMPLATE = 'template.tex'

template = latex_env.get_template(TEMPLATE)

base_filename = VARS['BASE_FILENAME']
tex_file = filepath(book_path, base_filename, 'tex')

with open(tex_file, 'w') as f:
    f.write(template.render(**VARS))

if not args.only_tex:
    if args.pdf or not args.epub:
        pdf_file = generate_pdf(book_path, base_filename, tex_file)
        if args.booklet:
            generate_booklet(pdf_file, filepath(book_path, base_filename, 'booklet.pdf'))
    if args.epub:
        generate_epub(book_path, base_filename, tex_file)
Example #11
def classify(data_folder, imgs_folder, args, estimators, test_size, n_tests, 
             n_folds, random_state):
    """Compute classification results.
    
    Check whether features have already been classified. If not,
    perform classification using each estimator for each dataset and
    descriptor, and save results to disk.

    Parameters
    ----------
    data_folder : string
        Full path of the folder where data are saved.
    imgs_folder : string
        Full path of the folder where texture datasets are stored.
    args : argparse.Namespace
        Command line arguments.
    estimators : list of tuples
        Each tuple consists of a classifier (such as nearest neighbour,
        support vector machine, etc.) and the parameters used for
        optimization through `GridSearch`.
    test_size : float
        Proportion (between 0.0 and 1.0) of the dataset to
        include in the test split.
    n_tests : int
        Number of reshuffling and splitting operations.
    n_folds : int
        Number of folds used for cross-validation. Must be at least 2.
    random_state : int
        Seed for the random number generator. This affects the splits into 
        train and test, and the cross-validation folds.
        
    """
    utils.boxed_text('Classifying...', symbol='*')

    for clf, param_grid in estimators:
        clf_id = ''.join(
                [letter for letter in clf.__name__ if letter.isupper()])
        for dat in gen_datasets(imgs_folder, args.dataset):
            dat_id = dat.acronym
            for descr in gen_descriptors(args):
                descr_id = descr.abbrev()                
                result_path = utils.filepath(
                        data_folder, dat_id, descr_id, clf_id)
                if os.path.isfile(result_path):
                    print(f'Loading {dat_id}--{descr_id}--{clf_id}', 
                          flush=True)
                    result = utils.load_object(result_path)
                else:
                    X = get_features(data_folder, dat, descr)
                    if X is None:
                        print(f'Skipping {dat_id}--{descr_id}--{clf_id}', 
                              flush=True)
                        continue
                    print(f'Computing {dat_id}--{descr_id}--{clf_id}', 
                          flush=True)
                    y = dat.labels
                    np.random.seed(random_state)
                    random_states = np.random.randint(size=n_tests, low=0, 
                                                      high=1000)    
                    # It is essential to pass a different `rstate` to 
                    # `grid_search_cv` for each grid search. Otherwise  
                    # data are split into train and test always the same  
                    # way and as a consequence, the results returned by 
                    # `grid_search_cv` are identical.
                    result = [grid_search_cv(X, y, clf, param_grid, n_folds, 
                                             test_size, rs) 
                              for rs in random_states]
                    utils.save_object(result, result_path)
        
                best_scores = [g.best_score_ for g, _ in result]
                test_scores = [ts for _, ts in result]
                print(f'Mean best cv score: {100*np.mean(best_scores):.2f}%')
                print(f'Mean test score: {100*np.mean(test_scores):.2f}%\n')
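grid_search_cv is project-specific and not shown in these examples. A minimal sketch of what such a helper might look like with scikit-learn, assuming it returns the fitted grid search together with the score on a held-out test split (the (g, ts) pairs unpacked above):

from sklearn.model_selection import GridSearchCV, train_test_split

def grid_search_cv(X, y, clf, param_grid, n_folds, test_size, random_state):
    # Hypothetical sketch: tune hyperparameters by cross-validated grid search
    # on a training split, then score the best estimator on the test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state)
    grid = GridSearchCV(clf(), param_grid, cv=n_folds)
    grid.fit(X_train, y_train)
    return grid, grid.score(X_test, y_test)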
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--BASE_FILENAME', default='default')
    parser.add_argument('book_path',
                        help='Folder containing the files for a book.',
                        metavar='folder')
    parser.add_argument('--no-split',
                        help='Do not split paragraphs.',
                        action='store_true')
    parser.add_argument('--pdf',
                        help='Generate the PDF version of the book.',
                        action='store_true')
    parser.add_argument('--booklet',
                        help='Generate the booklet version of the PDF.',
                        action='store_true')
    parser.add_argument('--epub',
                        help='Generate the EPUB version of the book.',
                        action='store_true')
    parser.add_argument('--only-tex',
                        help='Only generate the LaTeX file.',
                        action='store_true')
    parser.add_argument(
        '--sections',
        help='Use sections instead of chapters as the main element.',
        action='store_true')
    parser.add_argument(
        '--new-page-before-sections',
        help='Force a new page before the main sections.',
        action='store_true')
    parser.add_argument('--TITLE', default='TITLE')
    parser.add_argument('--SUBTITLE', default='')
    parser.add_argument('--AUTHOR', default='AUTHOR')
    parser.add_argument('--FONT_SIZE', default=11)
    parser.add_argument('--PAGE_SIZE', default='a5paper')
    parser.add_argument('--YEAR', default=datetime.now().year)
    parser.add_argument('--URL', default='')
    parser.add_argument('--exclude-index', action='store_true')
    args = parser.parse_args()
    book_path = args.book_path

    class EmptyConfig(object):
        pass

    if not os.path.isdir(book_path):
        print('The argument must be a directory')
        exit()
    config_file = os.path.join(book_path, 'config.py')
    if os.path.isfile(config_file):
        config = imp.load_source('config', config_file)
    else:
        config = EmptyConfig()
        config.CONFIGS = {}

    VARS = DEFAULTS.copy()
    VARS.update(config.CONFIGS)
    for k, v in args._get_kwargs():
        if not VARS.get(k):
            VARS[k] = v

    index_path = os.path.join(book_path, 'index.txt')

    split_paragraphs = not VARS['no_split']
    if os.path.isfile(index_path):
        with open(index_path, 'r') as f:
            content = ''
            for filename in f.readlines():
                if VARS['no_split']:
                    content += latex_chapter(
                        os.path.join(book_path, filename).strip(),
                        split_paragraphs)
                else:
                    content += latex_single(
                        os.path.join(book_path,
                                     filename).strip(), split_paragraphs,
                        VARS['sections'], VARS['new_page_before_sections'])
            VARS['CONTENT'] = content
    else:
        text_files = [
            f for f in glob.glob(os.path.join(book_path, '*.txt'))
            if not f.endswith('words.txt')
        ]
        if text_files:
            VARS['CONTENT'] = latex_single(text_files[0], split_paragraphs,
                                           VARS['sections'],
                                           VARS['new_page_before_sections'])

    sep_path = os.path.join(book_path, 'words.txt')
    if os.path.isfile(sep_path):
        with open(sep_path, 'r') as f:
            hyphenation = ''
            for word in f.readlines():
                hyphenation += latex_hyphenation(word.strip())
            VARS['HYPHENATION'] = hyphenation

    TEMPLATE = 'template.tex'
    local_template_path = os.path.join(book_path, 'template.tex')
    if os.path.isfile(local_template_path):
        template = latex_env.from_string(open(local_template_path).read())
    else:
        template = latex_env.get_template(TEMPLATE)

    base_filename = VARS['BASE_FILENAME']
    tex_file = filepath(book_path, base_filename, 'tex')

    with open(tex_file, 'w') as f:
        f.write(template.render(**VARS))

    if not args.only_tex:
        if args.pdf or not args.epub:
            pdf_file = generate_pdf(book_path, base_filename, tex_file)
            if args.booklet:
                generate_booklet(
                    pdf_file, filepath(book_path, base_filename,
                                       'booklet.pdf'))
        if args.epub:
            generate_epub(book_path, base_filename, tex_file)
Example #13
def generate_latex(args):
    '''Generate a LaTeX report (results.tex) summarising the classification results.'''

    def get_max_val(lst):
        '''Return the largest number in a list that may mix numbers and lists of numbers.'''
        flattened = []
        for item in lst:
            if isinstance(item, (int, float)):
                flattened.append(item)
            elif isinstance(item, list):
                flattened.extend(item)
        return max(flattened) if flattened else None
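    # e.g. get_max_val([93.1, [88.0, 95.2]]) -> 95.2; entries that are neither
    # numbers nor lists (such as None) are ignored.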

    # Load settings
    dbtex, imdescr = load_settings(args, config.IMGS)

    # Display information
    utils.display_sequence(dbtex, 'Datasets', symbol='-')
    utils.display_sequence(imdescr, 'Descriptors', symbol='-')
    utils.display_sequence(
        [est[0].__name__ for est in config.ESTIMATORS],
        'Classifiers', symbol='-')
    utils.display_message('Generating LaTeX code', symbol='*')

    # Generate introductory sections
    code = introduction_validation(
        config.DATA, config.N_TESTS, config.TEST_SIZE, config.N_FOLDS)
    code += introduction_classifiers(config.ESTIMATORS)
    code += introduction_dimensions(imdescr)
    code += introduction_parameters()
    code = [code]


    # `sects`: names of the used descriptors (sorted alphabetically)
    sections = sorted(set(d.__class__.__name__ for d in imdescr))

#    # !!! Refactoring required
    for s in sections:
        # `s`: section title (is a descriptor name)
        code.append(r'\section*{{{0}}}'.format(s))
        tups = [tuple(d.radius) for d in imdescr if d.__class__.__name__ == s]
        # `rlst`: radii considered for descriptor `s`
        rlst = [*map(list, sorted(set(tups), key=lambda x: (len(x), x)))]
        # `osect`: orders considered for descriptor `s`
        #osect = sorted(set(d.order for d in imdescr if d.__class__.__name__ == s), key=lambda x: len(x))
        osect = hep._orders
        for k, (clf, params) in enumerate(config.ESTIMATORS):
            if k > 0:
                code.append(r'\newpage')
            code.append(r'\subsection*{{{0}}}'.format(clf.__name__))
            code.append(r'\begin{{longtable}}{{ll{0}}}'.format('r'*len(osect)))
            heading = r' & '.join([o.capitalize() for o in osect])
            code.append(r'Dataset & Radius & {0} \\'.format(heading))
            code.append(r'\hline')
            for db in dbtex:
                for i, r in enumerate(rlst):
                    if i == 0:
                        line = r'{0} & {1} '.format(db, r)
                    else:
                        line = r' & {0} '.format(r)
                    vals = []
                    for o in osect:
                    #for o in hep._orders:
                        # `same`: list of descriptors with the same name, radius and order
                        same = [d for d in imdescr if d.__class__.__name__ == s and d.radius == r and d.order == o]
                        if not same:
                            #print('Not required:  {}--{}--{}--{}--{}'.format(db, s, r, o, clf.__name__))
                            vals.append(None)
                        elif len(same) == 1:
                            result_path = utils.filepath(config.DATA, db, same[0], clf)
                            if os.path.isfile(result_path):
                                #print('Reading single:  ', result_path)
                                result = utils.load_object(result_path)
                                acc = 100*np.mean([ts for g, ts in result])
                                vals.append(acc)
                            else:
                                print('Not found (single):  ', result_path)
                                vals.append(None)
                        else:
                            accs = []
                            for descr in same:
                                result_path = utils.filepath(config.DATA, db, descr, clf)
                                if os.path.isfile(result_path):
                                    #print('Reading multiple:  ', result_path)
                                    result = utils.load_object(result_path)
                                    accs.append(100*np.mean([ts for g, ts in result]))
                                else:
                                    print('Not found (multi):  ', result_path)
                            if not accs:
                                vals.append(None)
                            elif len(accs) == 1:
                                vals.append(accs[0])
                            elif len(accs) > 1:
                                vals.append([np.min(accs), np.max(accs)])
                    maxval = get_max_val(vals)
                    for v in vals:
                        if v is None:
                            line += r'& '
                        elif isinstance(v, (int, float)):
                            if v == maxval:
                                line += r'& \bfseries{{{0:.1f}}} '.format(v)
                            else:
                                line += r'& {0:.1f} '.format(v)
                        elif isinstance(v, list):
                            if v[1] == maxval:
                                line += r'& \bfseries{{{0:.1f}--{1:.1f}}} '.format(v[0], v[1])
                            else:
                                line += r'& {0:.1f}--{1:.1f} '.format(v[0], v[1])
                    line += r'\\'
                    code.append(line)

            code.append(r'\end{longtable}')

        code.append(r'\newpage')
    code.append(r'\end{document}')

    latex_path = os.path.join(config.DATA, 'results.tex')
    with open(latex_path, 'w') as fid:
        fid.write('\n'.join(code))
Example #14
#    ensure_dir_exists(destination)
#    download_KylbergSintorn(destination, x=4, y=4)

def job_script(folder, job_id, partition, datasets, descriptors, estimators=None):  # data, loops
    """
    !!!
    """
    for dat, descr in itertools.product(datasets, descriptors):
        dat_id = dat.acronym
        for rad in descr.radius:
            descr_single = copy.deepcopy(descr)
            descr_single.radius = [rad]
            descr_single_id = descr_single.abbrev()
            feat_args = [folder, dat_id, descr_single_id]
            if estimators is None:
                this_one = utils.filepath(*feat_args)
            else:
                for clf, _ in estimators:
                    res_args = feat_args + [clf.__name__]
                    this_one = utils.filepath(*res_args)



    datasets, descriptors, estimators = loops
    utils.display_sequence(datasets, 'Datasets', symbol='-')
    utils.display_sequence(descriptors, 'Descriptors', symbol='-')
    utils.display_sequence(
        [est[0].__name__ for est in estimators], 'Classifiers', symbol='-')

    utils.display_message('Creating arguments file and job script', symbol='*')