Esempio n. 1
0
def set_cache_dir(cachedir, bytes_limit=10*2**30):
    """Point the module's function cache at ``cachedir``.

    A silent (``verbose=0``) ``joblib.Memory`` store is created for
    ``cachedir``; its ``cache`` decorator and the directory are recorded on
    ``cache_data``, and ``fourierseries._fourier_component`` is rebound to a
    cached wrapper around ``fourierseries._base_fourier_component``.

    Parameters
    ----------
    cachedir: string
        directory handed to ``joblib.Memory``

    bytes_limit: int (optional)
        forwarded unchanged to ``joblib.Memory``; defaults to 10 * 2**30

    """
    memory = joblib.Memory(cachedir=cachedir,
                           bytes_limit=bytes_limit,
                           verbose=0)
    memoize = memory.cache

    # publish the decorator and its directory for other users of cache_data
    cache_data.cache = memoize
    cache_data.cachedir = cachedir

    # swap in the cached variant of the base Fourier component routine
    fourierseries._fourier_component = memoize(
        fourierseries._base_fourier_component)
Esempio n. 2
0
def generate_tsdiffana_thumbnail(image_files,
                                 sessions,
                                 subject_id,
                                 output_dir,
                                 results_gallery=None,
                                 tooltips=None):
    """Generate tsdiffana thumbnails

    The time-slice-difference results are computed through a joblib cache
    located in ``<output_dir>/QA``, plotted one figure per axis, saved as
    ``tsdiffana_plot_<i>.png`` in ``output_dir`` and wrapped in thumbnails.

    Parameters
    ----------
    image_files: list of strings or list of lists
        paths (4D case) or lists of paths (3D case) of images under
        inspection

    sessions: list
        list of session ids, one per element of image_files

    subject_id: string
        id of subject under inspection (not used by the body of this
        function)

    output_dir: string
        dir to which all output will be written

    results_gallery: ResultsGallery instance (optional)
        gallery to which thumbnails will be committed

    tooltips: list (optional)
        one tooltip per generated plot; when None, no thumbnail gets a
        tooltip

    Returns
    -------
    thumbnails: list
        one Thumbnail per saved plot

    """
    # plot figures
    qa_cache_dir = os.path.join(output_dir, "QA")
    if not os.path.exists(qa_cache_dir):
        os.makedirs(qa_cache_dir)
    qa_mem = joblib.Memory(cachedir=qa_cache_dir, verbose=5)
    results = qa_mem.cache(multi_session_time_slice_diffs)(image_files)
    axes = plot_tsdiffs(results, use_same_figure=False)
    figures = [ax.get_figure() for ax in axes]
    output_filename_template = os.path.join(output_dir,
                                            "tsdiffana_plot_{0}.png")
    output_filenames = [
        output_filename_template.format(i) for i in range(len(figures))
    ]
    for fig, output_filename in zip(figures, output_filenames):
        fig.savefig(output_filename, bbox_inches="tight", dpi=200)
        # release the figure once it is on disk
        pl.close(fig)

    if tooltips is None:
        # One (empty) tooltip per plot.  This must be sized by the list of
        # filenames, not by the last filename string left over from the loop
        # above (which is also undefined when there are no figures).
        tooltips = [None] * len(output_filenames)

    # create thumbnails
    thumbnails = []
    for output_filename, tooltip in zip(output_filenames, tooltips):
        thumbnail = Thumbnail(tooltip=tooltip)
        thumbnail.a = a(href=os.path.basename(output_filename))
        thumbnail.img = img(src=os.path.basename(output_filename),
                            height="250px",
                            width="600px")
        thumbnail.description = "tsdiffana ({0} sessions)".format(
            len(sessions))
        thumbnails.append(thumbnail)
    if results_gallery:
        results_gallery.commit_thumbnails(thumbnails)
    return thumbnails
Esempio n. 3
0
def resample_img(input_img_filename, new_vox_dims, output_filename=None):
    """
    Resample an image to new voxel dimensions and write the result to disk.

    Parameters
    ----------
    input_img_filename: string
        path to the image to be resampled

    new_vox_dims: list or tuple of +ve floats
        voxel dimensions to resample to; they form the diagonal of the
        target affine handed to nilearn

    output_filename: string (optional)
        where the resampled image is written; defaults to the input file
        name prefixed with "resample_", in the input image's directory

    Returns
    -------
    output_filename: string
        path of the written, resampled image

    Raises
    ------
    RuntimeError
        if nilearn cannot be imported

    """

    try:
        from nilearn.image import resample_img as ni_resample_img
    except ImportError:
        raise RuntimeError(
            "nilearn not found on your system; can't do resampling!")

    # default destination sits next to the input image
    if output_filename is None:
        src_dir, src_name = os.path.split(input_img_filename)
        output_filename = os.path.join(src_dir, "resample_" + src_name)

    # the joblib cache lives beside the output file
    cache_dir = os.path.join(os.path.dirname(output_filename),
                             "resample_img_cache")
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    cached_resample = joblib.Memory(cachedir=cache_dir,
                                    verbose=5).cache(ni_resample_img)

    # resample (through the cache), then persist the result
    target_affine = np.diag(new_vox_dims)
    resampled_img = cached_resample(input_img_filename,
                                    target_affine=target_affine)
    nibabel.save(resampled_img, output_filename)

    return output_filename
Esempio n. 4
0
    def transform(self, X, y=None):
        """Compute the feature matrix for the epochs in X.

        The work is delegated to ``extract_features``, wrapped in a joblib
        cache whose directory is ``self.memory``.

        Parameters
        ----------
        X : ndarray, shape (n_epochs, n_channels, n_times)

        y : None
            Ignored; present only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        Returns
        -------
        Xnew : ndarray, shape (n_epochs, n_features)
            Extracted features.
        """
        cached_extract = joblib.Memory(cachedir=self.memory).cache(
            extract_features)
        features = cached_extract(X,
                                  self.sfreq,
                                  self.selected_funcs,
                                  funcs_params=self.params,
                                  n_jobs=self.n_jobs)
        return features
    def transform(self, X):
        """Compute the feature matrix for the epochs in X.

        ``extract_features`` is called through a joblib cache located at
        ``self.memory``.

        Parameters
        ----------
        X : ndarray, shape (n_epochs, n_channels, n_times)

        Returns
        -------
        Xnew : ndarray, shape (n_epochs, n_features)
            Extracted features.
        """
        store = joblib.Memory(location=self.memory)
        cached_extract = store.cache(extract_features)
        features = cached_extract(
            X, self.sfreq, self.selected_funcs,
            funcs_params=self.params, n_jobs=self.n_jobs)
        return features
Esempio n. 6
0
def do_3Dto4D_merge(threeD_img_filenames,
                    output_dir=None,
                    output_filename=None):
    """
    This function produces a single 4D nifti image from several 3D.

    Parameters
    ----------
    threeD_img_filenames: string or list of strings
        paths to images to be merged; a single path (string) is simply
        loaded and returned as-is

    output_dir: string (optional)
        directory in which the joblib cache ("merge" sub-directory) is
        created; a fresh temporary directory is used when None

    output_filename: string (optional)
        if given, the merged image is also saved to this path

    Returns
    -------
    returns nifti image object

    """

    if isinstance(threeD_img_filenames, _basestring):
        return nibabel.load(threeD_img_filenames)

    if output_dir is None:
        output_dir = tempfile.mkdtemp()

    # prepare for smart caching
    merge_cache_dir = os.path.join(output_dir, "merge")
    if not os.path.exists(merge_cache_dir):
        os.makedirs(merge_cache_dir)
    merge_mem = joblib.Memory(cachedir=merge_cache_dir, verbose=5)

    # merging proper
    fourD_img = merge_mem.cache(nibabel.concat_images)(threeD_img_filenames,
                                                       check_affines=False)

    # sanity: drop the 4th axis of a 5D result by keeping its first entry.
    # NumPy accepts at most one Ellipsis per index expression, so the axes
    # are spelled out explicitly.
    if len(fourD_img.shape) == 5:
        fourD_img = nibabel.Nifti1Image(
            fourD_img.get_data()[:, :, :, 0, :],
            fourD_img.get_affine())

    # save image to disk
    # NOTE(review): the save goes through the joblib cache; presumably a
    # repeated call with identical arguments may skip the write -- confirm.
    if output_filename is not None:
        merge_mem.cache(nibabel.save)(fourD_img, output_filename)

    return fourD_img
Esempio n. 7
0
# display figures in the notebook
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits
from sklearn.externals import joblib
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
from keras.utils.np_utils import to_categorical

# joblib cache rooted at /tmp; make_curves below is memoized through it.
m = joblib.Memory(cachedir='/tmp')


@m.cache()
def make_curves(random_state=42):
    digits = load_digits()
    rng = np.random.RandomState(random_state)

    data = np.asarray(digits.data, dtype='float32')
    target = np.asarray(digits.target, dtype='int32')

    # Add noise in the labels to cause more overfitting
    target[:200] = rng.randint(0, 10, size=200)

    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=0.15, random_state=random_state)

    # mean = 0 ; standard deviation = 1.0
Esempio n. 8
0
def generate_segmentation_thumbnails(normalized_files,
                                     output_dir,
                                     subject_gm_file=None,
                                     subject_wm_file=None,
                                     subject_csf_file=None,
                                     only_native=False,
                                     brain='func',
                                     comments="",
                                     execution_log_html_filename=None,
                                     cmap=None,
                                     tooltip=None,
                                     results_gallery=None):
    """Generates thumbnails after indirect normalization
    (segmentation + normalization)

    Parameters
    ----------
    normalized_files: string or list
        path(s) to normalized images (3Ds or 4Ds); a list is first reduced
        to a single image via compute_mean_3D_image

    output_dir: string
        dir to which all output will be written

    subject_gm_file: string (optional)
        path to subject GM file; the subject-compartment plots are only
        produced when this is given

    subject_wm_file: string (optional)
        path to subject WM file

    subject_csf_file: string (optional)
        path to subject CSF file

    only_native: bool (optional)
        if True, the template-compartment plots are skipped

    brain: string (optional)
        a short comment/tag like 'epi', or 'anat'; used in output file
        names, plot titles and the thumbnail description

    comments: string (optional)
        if non-empty, prepended (in parentheses) to `brain` in output file
        names and plot titles

    execution_log_html_filename: string (optional)
        if given, a link to its base name is appended to the thumbnail
        description

    cmap: optional
        cmap (color map) to use for plots

    tooltip: optional
        tooltip passed to each created Thumbnail

    results_gallery: ResultsGallery instance (optional)
        gallery to which thumbnails will be committed

    Returns
    -------
    output: dict
        holds the path of an axial plot under key 'axial' when one was
        produced (template plot, or the subject plot when only_native is
        True); otherwise empty

    """
    # a single path is used as-is; several images are collapsed into one
    # file named after `brain` inside output_dir
    if isinstance(normalized_files, _basestring):
        normalized_file = normalized_files
    else:
        mean_normalized_file = os.path.join(output_dir, "%s.nii" % brain)
        compute_mean_3D_image(normalized_files,
                              output_filename=mean_normalized_file)
        normalized_file = mean_normalized_file
    output = {}

    # prepare for smart caching
    qa_cache_dir = os.path.join(output_dir, "QA")
    if not os.path.exists(qa_cache_dir):
        os.makedirs(qa_cache_dir)
    qa_mem = joblib.Memory(cachedir=qa_cache_dir, verbose=5)

    # description shared by every thumbnail created below
    thumb_desc = "Segmentation of %s " % brain
    if execution_log_html_filename:
        thumb_desc += (" (<a href=%s>see execution "
                       "log</a>)") % (
                           os.path.basename(execution_log_html_filename))
    # tag used in file names and titles: "(<comments>) <brain>" or just brain
    _brain = "(%s) %s" % (comments, brain) if comments else brain

    # plot contours of template compartments on subject's brain
    if not only_native:
        template_compartments_contours = os.path.join(
            output_dir, "template_tpms_contours_on_%s.png" % _brain)
        template_compartments_contours_axial = os.path.join(
            output_dir,
            "template_compartments_contours_on_%s_axial.png" % _brain)
        # axial (display_mode='z') view
        qa_mem.cache(plot_segmentation)(
            normalized_file,
            GM_TEMPLATE,
            wm_filename=WM_TEMPLATE,
            csf_filename=CSF_TEMPLATE,
            display_mode='z',
            cmap=cmap,
            output_filename=template_compartments_contours_axial,
            title="template TPMs",
            close=True)
        # same plot with plot_segmentation's default display mode; this is
        # the image referenced by the thumbnail
        qa_mem.cache(plot_segmentation)(
            normalized_file,
            gm_filename=GM_TEMPLATE,
            wm_filename=WM_TEMPLATE,
            csf_filename=CSF_TEMPLATE,
            output_filename=template_compartments_contours,
            cmap=cmap,
            close=True,
            title=("Template GM, WM, and CSF TPM contours on "
                   "subject's %s") % _brain)

        # create thumbnail
        if results_gallery:
            thumbnail = Thumbnail(tooltip=tooltip)
            thumbnail.a = a(
                href=os.path.basename(template_compartments_contours))
            thumbnail.img = img(
                src=os.path.basename(template_compartments_contours),
                height="250px")
            thumbnail.description = thumb_desc

            results_gallery.commit_thumbnails(thumbnail)

        output['axial'] = template_compartments_contours_axial

    # plot contours of subject's compartments on subject's brain
    if subject_gm_file:
        subject_compartments_contours = os.path.join(
            output_dir, "subject_tpms_contours_on_subject_%s.png" % _brain)
        subject_compartments_contours_axial = os.path.join(
            output_dir,
            "subject_tpms_contours_on_subject_%s_axial.png" % _brain)

        # axial (display_mode='z') view
        qa_mem.cache(plot_segmentation)(
            normalized_file,
            subject_gm_file,
            wm_filename=subject_wm_file,
            csf_filename=subject_csf_file,
            display_mode='z',
            cmap=cmap,
            output_filename=subject_compartments_contours_axial,
            close=True,
            title="subject TPMs")

        # the title only mentions the compartments that were supplied
        title_prefix = "Subject's GM"
        if subject_wm_file:
            title_prefix += ", WM"
        if subject_csf_file:
            title_prefix += ", and CSF"
        qa_mem.cache(plot_segmentation)(
            normalized_file,
            subject_gm_file,
            wm_filename=subject_wm_file,
            csf_filename=subject_csf_file,
            cmap=cmap,
            close=True,
            output_filename=subject_compartments_contours,
            title=("%s TPM contours on "
                   "subject's %s") % (title_prefix, _brain))

        # create thumbnail
        if results_gallery:
            thumbnail = Thumbnail(tooltip=tooltip)
            thumbnail.a = a(
                href=os.path.basename(subject_compartments_contours))
            thumbnail.img = img(
                src=os.path.basename(subject_compartments_contours),
                height="250px")
            thumbnail.description = thumb_desc

            results_gallery.commit_thumbnails(thumbnail)

        # without template plots, the subject's axial plot is the one
        # reported back to the caller
        if only_native:
            output['axial'] = subject_compartments_contours_axial

    return output
Esempio n. 9
0
def generate_registration_thumbnails(target,
                                     source,
                                     procedure_name,
                                     output_dir,
                                     tooltip=None,
                                     execution_log_html_filename=None,
                                     results_gallery=None):
    """
    Generates QA thumbnails post-registration.

    Two outline plots are written to `output_dir`: first `target` against
    `source`, then the two roles exchanged. For the exchanged pair an extra
    axial (display_mode='z') plot is produced. All plotting goes through a
    joblib cache located in `<output_dir>/QA`.

    Parameters
    ----------
    target: tuple of length 2
        target[0]: string
            path to reference image used in the registration
        target[1]: string
            short name (e.g 'anat', 'epi', 'MNI', etc.) for the
            reference image
    source: tuple of length 2
        source[0]: string
            path to source image
        source[1]: string
            short name (e.g 'anat', 'epi', 'MNI', etc.) for the
            source image
    procedure_name: string
        name of, or short comments on, the registration procedure used
        (e.g 'anat ==> func', etc.); used as thumbnail description
    output_dir: string
        dir to which the plots are written
    tooltip: optional
        tooltip passed to each created Thumbnail
    execution_log_html_filename: string (optional)
        if given, a link to its base name is appended to the description
    results_gallery: ResultsGallery instance (optional)
        gallery to which thumbnails will be committed

    Returns
    -------
    output: dict
        path of the axial outline plot, under key 'axial'

    """
    output = {}

    # prepare for smart caching
    cache_root = os.path.join(output_dir, "QA")
    if not os.path.exists(cache_root):
        os.makedirs(cache_root)
    qa_mem = joblib.Memory(cachedir=cache_root, verbose=5)

    # description shared by both thumbnails
    if execution_log_html_filename:
        description = procedure_name + (
            " (<a href=%s>see execution log</a>)" % (
                os.path.basename(execution_log_html_filename)))
    else:
        description = procedure_name

    def _commit_thumbnail(png_path):
        # one gallery entry linking to, and previewing, the given plot
        if not results_gallery:
            return
        thumb = Thumbnail(tooltip=tooltip)
        link = os.path.basename(png_path)
        thumb.a = a(href=link)
        thumb.img = img(src=link, height="250px")
        thumb.description = description
        results_gallery.commit_thumbnails(thumb)

    ref_path, ref_name = target[0], target[1]
    mov_path, mov_name = source[0], source[1]

    # first pass: outline (edge map) of `target` on `source`
    labels = (ref_name, mov_name)
    outline = os.path.join(output_dir, "%s_on_%s_outline.png" % labels)
    qa_mem.cache(plot_registration)(ref_path,
                                    mov_path,
                                    output_filename=outline,
                                    close=True,
                                    title="Outline of %s on %s" % labels)
    _commit_thumbnail(outline)

    # second pass: roles exchanged, i.e. outline of `source` on `target`
    labels = (mov_name, ref_name)
    outline = os.path.join(output_dir, "%s_on_%s_outline.png" % labels)
    outline_axial = os.path.join(
        output_dir, "%s_on_%s_outline_axial.png" % labels)

    # axial view; this is the path reported back to the caller
    qa_mem.cache(plot_registration)(mov_path,
                                    ref_path,
                                    output_filename=outline_axial,
                                    close=True,
                                    display_mode='z',
                                    title="Outline of %s on %s" % labels)
    output['axial'] = outline_axial

    # default view, used for the thumbnail
    qa_mem.cache(plot_registration)(mov_path,
                                    ref_path,
                                    output_filename=outline,
                                    close=True,
                                    title="Outline of %s on %s" % labels)
    _commit_thumbnail(outline)

    return output
Esempio n. 10
0
from sklearn.model_selection import cross_val_predict, cross_val_score, train_test_split, GridSearchCV

from sklearn.datasets import load_svmlight_file
from sklearn.decomposition import PCA, LatentDirichletAllocation

from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
    BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier

from easyml.utils import util

# joblib cache rooted at ./mycache (a path relative to the current working
# directory); it backs the @memory.cache decorator on get_data.
memory = joblib.Memory("./mycache")


@memory.cache
def get_data(file_name):
    """Load an svmlight/libsvm file and return its first two fields.

    The call is memoized on disk through ``memory``.

    Parameters
    ----------
    file_name: path handed to ``load_svmlight_file``

    Returns
    -------
    tuple
        elements 0 and 1 of what ``load_svmlight_file`` returned
    """
    loaded = load_svmlight_file(file_name)
    first, second = loaded[0], loaded[1]
    return first, second


def arff2svm(arff_files):
    svm_files = []
    for arff_file in arff_files:
        name = arff_file[0:arff_file.rindex('.')]
        tpe = arff_file[arff_file.rindex('.') + 1:]
        svm_file = name + ".libsvm"
        svm_files.append(svm_file)
Esempio n. 11
0
from urllib import request

import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from sklearn import tree
from sklearn.externals import joblib
import starboost as sb

# Directory in which the downloaded dataset archive is stored (see load_data).
HERE = './'
# Location of the gzipped HIGGS CSV file.
URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz'
# joblib cache under /tmp; mmap_mode='r' is forwarded to joblib.Memory.
m = joblib.Memory(location='/tmp', mmap_mode='r')

# NOTE(review): the constants below are not used in the visible part of this
# file; presumably model hyper-parameters and a row-subsample size -- confirm.
MAX_DEPTH = 3
N_ESTIMATORS = 100
LEARNING_RATE = 0.1
SUBSAMPLE = 50000


@m.cache
def load_data():
    filename = os.path.join(HERE, URL.rsplit('/', 1)[-1])
    if not os.path.exists(filename):
        print(f'Downloading {URL} to {filename}...')
        request.urlretrieve(URL, filename)
    print(f'Parsing {filename}...')
    with gzip.GzipFile(filename) as f:
Esempio n. 12
0
    # We can combine our feature extraction, selection and final SVC in one step
    svc = LinearSVC()
    pipeline = Pipeline([('vectorize', vectorizer), ('select', selector),
                         ('svc', svc)])
    cross_val_score(pipeline, X, y, verbose=3)
    # [CV] no parameters to be set .........................................
    # [CV] ................ no parameters to be set, score=0.888212 -   4.2s
    # [Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    4.2s
    # [CV] no parameters to be set .........................................
    # [CV] ................ no parameters to be set, score=0.891068 -   4.2s
    # [CV] no parameters to be set .........................................
    # [CV] ................ no parameters to be set, score=0.888741 -   4.4s
    # [Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   12.8s finished

    # Parameter selection - pipeline returned a lot of different parameters
    # Pipeline object exposes the parameters of the estimators it wraps with
    # the following convention: name of the estimator, __, name of parameter
    pipeline.set_params(svc__C=10)  # set SVC's C parameter

    # Choosing parameters by cross-validation may imply running transformers
    # many times on the same data with the same parameters.  Can avoid overhead
    # by using joblib's memory
    memory = joblib.Memory(cachedir='.')
    memory.clear()
    selector.score_func = memory.cache(selector.score_func)

    # GridSearchCV - Use gridsearch to choose the best C between 3 values
    grid = GridSearchCV(estimator=pipeline,
                        param_grid=dict(svc__C=[1e-2, 1, 1e2]))
    grid.fit(X, y)
    print grid.best_estimator_.named_steps['svc']
Esempio n. 13
0
File: run.py Progetto: cemoody/rtvf
import sys
import scipy
import numpy as np
from model import fit
from sklearn.externals import joblib

# joblib cache stored in the relative directory 'mem'; load_all is memoized
# through it.
mem = joblib.Memory('mem')


def load_image(fn, xshift=0, yshift=0, downsample=8, transpose=False):
    """Load an image, crop a square around its (shifted) centre and subsample.

    Parameters
    ----------
    fn : str
        Path handed to ``scipy.misc.imread``.
    xshift, yshift : int
        Offsets added to the crop centre along axis 0 and axis 1.
    downsample : int
        Stride applied to both leading axes of the crop; a falsy value
        disables subsampling.
    transpose : bool
        If True, the last axis is moved to the front (axes reordered as
        ``(2, 0, 1)``) before returning.

    Returns
    -------
    y : ndarray
        float32 pixel values divided by 255. The two cropped axes have equal
        length; they are the leading axes unless ``transpose`` is True.

    Raises
    ------
    AssertionError
        If the crop is not square.
    """
    x = scipy.misc.imread(fn).astype('float32') / 255.
    # half side of the largest square that fits in the two leading axes
    m = int(min(x.shape[:2]) / 2)
    xc, yc = [int(s / 2) for s in x.shape[:2]]
    xc += xshift
    yc += yshift
    # NOTE(review): a shift that pushes the window past the image border is
    # not handled here; it is only caught by the assertion below when the
    # resulting crop stops being square.
    y = x[xc - m:xc + m, yc - m:yc + m, :]
    if downsample:
        y = y[::downsample, ::downsample, :]
    # Check squareness while the cropped axes are still axes 0 and 1. After
    # the transpose below, axis 0 would be the former last axis and the
    # comparison would be meaningless.
    assert y.shape[0] == y.shape[1]
    if transpose:
        y = y.transpose((2, 0, 1))
    return y


@mem.cache
def load_all(fns):
    """Load every image in ``fns`` and stack the results into one array.

    Each file goes through ``load_image`` with ``yshift=-150``; the call is
    memoized on disk through ``mem``.
    """
    images = []
    for path in fns:
        images.append(load_image(path, yshift=-150))
    return np.array(images)
Esempio n. 14
0
# Project root: two directory levels above the directory holding this file.
PROJECT_DIR = os.path.normpath(os.path.join(os.path.dirname(__file__),
                                            os.pardir,
                                            os.pardir))
# Locations of the real and the dummy raw data inside the project.
RAW_DATA_DIR = os.path.join(PROJECT_DIR, 'data', 'raw')
RAW_DATA_DIR_DUMMY = os.path.join(PROJECT_DIR, 'data', 'raw_dummy')
# File names of the data files; the 'fields' entry is a template with two
# positional placeholders.
DATA_FILE_NAMES = dict(results='parameters_and_results.h5',
                       fields='field_data_{}_{}.h5')

# Helper to switch between real and dummy data
_USE_DUMMIES = {'do': False}

# Helper to toggle cache usage
_USE_CACHE = {'do': False}

# A global joblib cache to persist output of functions.
# The 'cache' directory is resolved against the working directory at import
# time.
MEMORY = joblib.Memory(cachedir=os.path.abspath('cache'), verbose=0)


def format_time(t):
    """Render a duration of ``t`` seconds in ``timedelta``'s string form."""
    elapsed = timedelta(seconds=t)
    return str(elapsed)


def set_dummy_mode(use_dummies, verbose=True):
    """
    Convenience function to toggle dummy mode.

    Parameters
    ----------
    use_dummies : bool
        Whether to use dummy data.
Esempio n. 15
0
import logging
from pathlib import Path

import sklearn.datasets

from sklearn.externals import joblib

# Cache directory: '.cache' inside the parent of this file's directory.
location = Path(__file__).resolve().parent.parent / '.cache'
# converted to a plain string before being handed to joblib
location = str(location)
# logging.DEBUG (the integer 10) is used here as joblib's verbosity level.
mem = joblib.Memory(location=location, verbose=logging.DEBUG)


@mem.cache
def load_svmlight_file(*args, **kwargs):
    """Disk-cached pass-through to ``sklearn.datasets.load_svmlight_file``.

    All positional and keyword arguments are forwarded unchanged and the
    wrapped function's result is returned as-is.
    """
    loader = sklearn.datasets.load_svmlight_file
    return loader(*args, **kwargs)
def main(inputs,
         infile_estimator,
         infile1,
         infile2,
         outfile_result,
         outfile_object=None,
         groups=None):
    """
    Run a hyper-parameter search driven by a JSON parameter file and write
    either the search's cv results or the best estimator's test scores.

    Parameter
    ---------
    inputs : str
        File path to galaxy tool parameter

    infile_estimator : str
        File path to estimator

    infile1 : str
        File path to dataset containing features

    infile2 : str
        File path to dataset containing target values

    outfile_result : str
        File path to save the results, either cv_results or test result

    outfile_object : str, optional
        File path to save searchCV object

    groups : str
        File path to dataset containing groups labels
    """
    # `collections.Mapping` no longer exists on Python >= 3.10; the ABC lives
    # in `collections.abc`. Imported locally so this block is self-contained.
    from collections.abc import Mapping

    warnings.simplefilter('ignore')

    with open(inputs, 'r') as param_handler:
        params = json.load(param_handler)
    if groups:
        # hand the groups file to the cv selector through the parameters
        (params['search_schemes']['options']['cv_selector']['groups_selector']
         ['infile_g']) = groups

    params_builder = params['search_schemes']['search_params_builder']

    # ---- features -------------------------------------------------------
    input_type = params['input_options']['selected_input']
    if input_type == 'tabular':
        header = 'infer' if params['input_options']['header1'] else None
        column_option = (params['input_options']['column_selector_options_1']
                         ['selected_column_selector_option'])
        if column_option in [
                'by_index_number', 'all_but_by_index_number', 'by_header_name',
                'all_but_by_header_name'
        ]:
            c = params['input_options']['column_selector_options_1']['col1']
        else:
            c = None
        X = read_columns(infile1,
                         c=c,
                         c_option=column_option,
                         sep='\t',
                         header=header,
                         parse_dates=True).astype(float)
    else:
        # close the handle once the matrix has been read
        with open(infile1, 'r') as matrix_handler:
            X = mmread(matrix_handler)

    # ---- targets --------------------------------------------------------
    header = 'infer' if params['input_options']['header2'] else None
    column_option = (params['input_options']['column_selector_options_2']
                     ['selected_column_selector_option2'])
    if column_option in [
            'by_index_number', 'all_but_by_index_number', 'by_header_name',
            'all_but_by_header_name'
    ]:
        c = params['input_options']['column_selector_options_2']['col2']
    else:
        c = None
    y = read_columns(infile2,
                     c=c,
                     c_option=column_option,
                     sep='\t',
                     header=header,
                     parse_dates=True)
    y = y.ravel()

    # ---- search configuration ------------------------------------------
    # the search class is looked up by name on `model_selection`
    optimizer = params['search_schemes']['selected_search_scheme']
    optimizer = getattr(model_selection, optimizer)

    options = params['search_schemes']['options']

    # note: `groups` is rebound here to whatever get_cv returns
    splitter, groups = get_cv(options.pop('cv_selector'))
    options['cv'] = splitter
    options['n_jobs'] = N_JOBS
    primary_scoring = options['scoring']['primary_scoring']
    options['scoring'] = get_scoring(options['scoring'])
    if options['error_score']:
        options['error_score'] = 'raise'
    else:
        # `np.nan` is the canonical spelling; the `np.NaN` alias was removed
        # in NumPy 2.0
        options['error_score'] = np.nan
    if options['refit'] and isinstance(options['scoring'], dict):
        options['refit'] = primary_scoring
    if 'pre_dispatch' in options and options['pre_dispatch'] == '':
        options['pre_dispatch'] = None

    with open(infile_estimator, 'rb') as estimator_handler:
        estimator = load_model(estimator_handler)

    # ---- estimator caching / parallelism -------------------------------
    memory = joblib.Memory(location=CACHE_DIR, verbose=0)
    # cache iraps_core fits could increase search speed significantly
    if estimator.__class__.__name__ == 'IRAPSClassifier':
        estimator.set_params(memory=memory)
    else:
        for p, v in estimator.get_params().items():
            if p.endswith('memory'):
                # p[:-8] drops the last 8 characters (presumably the
                # '__memory' suffix) to look at the owning step's name
                if len(p) > 8 and p[:-8].endswith('irapsclassifier'):
                    # cache iraps_core fits could increase search
                    # speed significantly
                    new_params = {p: memory}
                    estimator.set_params(**new_params)
                elif v:
                    # any other configured cache is switched off; this must
                    # be a dict ({p: None}), not a set, to be unpacked with **
                    new_params = {p: None}
                    estimator.set_params(**new_params)
            elif p.endswith('n_jobs'):
                # nested estimators run single-job; the search itself gets
                # N_JOBS through `options`
                new_params = {p: 1}
                estimator.set_params(**new_params)

    param_grid = _eval_search_params(params_builder)
    searcher = optimizer(estimator, param_grid, **options)

    # do train_test_split
    do_train_test_split = params['train_test_split'].pop('do_split')
    if do_train_test_split == 'yes':
        # make sure refit is chosen
        if not options['refit']:
            raise ValueError("Refit must be `True` for shuffle splitting!")
        split_options = params['train_test_split']

        # splits
        if split_options['shuffle'] == 'stratified':
            split_options['labels'] = y
            X, X_test, y, y_test = train_test_split(X, y, **split_options)
        elif split_options['shuffle'] == 'group':
            if not groups:
                raise ValueError("No group based CV option was "
                                 "choosen for group shuffle!")
            split_options['labels'] = groups
            X, X_test, y, y_test, groups, _ =\
                train_test_split(X, y, **split_options)
        else:
            if split_options['shuffle'] == 'None':
                split_options['shuffle'] = None
            X, X_test, y, y_test =\
                train_test_split(X, y, **split_options)
    # end train_test_split

    if options['error_score'] == 'raise':
        searcher.fit(X, y, groups=groups)
    else:
        warnings.simplefilter('always', FitFailedWarning)
        with warnings.catch_warnings(record=True) as w:
            try:
                searcher.fit(X, y, groups=groups)
            except ValueError:
                # deliberately swallowed; the recorded warnings are printed
                # below
                pass
            for warning in w:
                print(repr(warning.message))

    if do_train_test_split == 'no':
        # save results
        cv_results = pandas.DataFrame(searcher.cv_results_)
        cv_results = cv_results[sorted(cv_results.columns)]
        cv_results.to_csv(path_or_buf=outfile_result,
                          sep='\t',
                          header=True,
                          index=False)

    # output test result using best_estimator_
    else:
        best_estimator_ = searcher.best_estimator_
        is_multimetric = isinstance(options['scoring'], Mapping)

        test_score = _score(best_estimator_,
                            X_test,
                            y_test,
                            options['scoring'],
                            is_multimetric=is_multimetric)
        if not is_multimetric:
            test_score = {primary_scoring: test_score}
        # wrap each score in a list so the frame has exactly one row
        for key, value in test_score.items():
            test_score[key] = [value]
        result_df = pandas.DataFrame(test_score)
        result_df.to_csv(path_or_buf=outfile_result,
                         sep='\t',
                         header=True,
                         index=False)

    # drop whatever the search cached on disk
    memory.clear(warn=False)

    if outfile_object:
        with open(outfile_object, 'wb') as output_handler:
            pickle.dump(searcher, output_handler, pickle.HIGHEST_PROTOCOL)
Esempio n. 17
0
def make_lens_catalog(args):
    """
    NAME
        make_lens_catalog

    PURPOSE
        Given location of collection pickle, this script produces a set of
        annotated images of lenses (heatmaps for lens locations, markers for
        where clicks were, etc).

    COMMENTS
        You have to download the file so it chooses whever your output
        directory is to also download the raw images.
        This should be pretty customizable.

    FLAGS
        -h              Print this message

        --skill         Weight by skill


    INPUTS
        collection.pickle

    OUTPUTS
        lens.dat
            Assumed format:
            ID   kind   x   y    Prob     N0   Skill   Dist

            Here:
            ID = Space Warps subject ID
            kind = Space Warps subject type (sim, dud, test)
            x,y = object (cluster) centroid, in pixels
            P = Space Warps subject probability
            N0 = number of markers in the cluster
            S = total skill per cluster, summed over markers
            D = biggest distance within cluster

    EXAMPLE

    BUGS

    AUTHORS
        This file is part of the Space Warps project, and is distributed
        under the GPL v2 by the Space Warps Science Team.
        http://spacewarps.org/

    HISTORY
        2013-07-16  started Davis (KIPAC)
    """

    # ------------------------------------------------------------------
    # Some defaults:

    flags = {
        'skill': False,
        'output_directory': './',
        'output_name': 'catalog.dat',
        'image_y_size': 440,
        'catalog_path': '',
        'update_collection': '',
    }

    # ------------------------------------------------------------------
    # Read in options:

    # this has to be easier to do...
    for arg in args:
        if arg in flags:
            flags[arg] = args[arg]
        elif arg == 'collection_path':
            collection_path = args[arg]
        else:
            print "make_lens_atlas: unrecognized flag ", arg

    print "make_lens_catalog: illustrating behaviour captured in collection file: "
    print "make_lens_catalog: ", collection_path

    memory = joblib.Memory(cachedir=flags['output_directory'])
    memory.clear()

    catalog_path = flags['output_directory'] + flags['output_name']
    if len(flags['output_name']) > 0:
        F = open(catalog_path, 'w')
        F.write('id,kind,x,y,prob,n0,skill,dist\n')

    # ------------------------------------------------------------------
    # Read in files:

    collection = swap.read_pickle(collection_path, 'collection')
    ID_list = collection.list()
    print "make_lens_catalog: collection numbers ", len(ID_list)

    if flags['catalog_path'] != '':
        print "make_lens_catalog: filtering from catalog ", flags[
            'catalog_path']
        catalog_in = csv2rec(flags['catalog_path'])
        ID_list = np.unique(catalog_in['id'])

    # ------------------------------------------------------------------
    # Run through data:

    catalog = {}
    for ID in ID_list:

        subject = collection.member[ID]
        kind = subject.kind
        P = subject.mean_probability

        itwas = subject.annotationhistory['ItWas']
        x_all = subject.annotationhistory['At_X']
        y_all = subject.annotationhistory['At_Y']

        x_markers = np.array([xi for xj in x_all for xi in xj])
        y_markers = np.array([yi for yj in y_all for yi in yj])

        catalog.update(
            {ID: {
                'agents_reject': [],
                'x': x_markers,
                'y': y_markers,
            }})
        PL_all = subject.annotationhistory['PL']
        PD_all = subject.annotationhistory['PD']

        # filter out the empty clicks
        PL_list = []
        PL_nots = []
        for i, xj in enumerate(x_all):
            # len(xj) of empty = 0
            PL_list.append([PL_all[i]] * len(xj))
            if len(xj) == 0:
                PL_nots.append(PL_all[i])
        PL = np.array([PLi for PLj in PL_list for PLi in PLj])
        PL_nots = np.array(PL_nots)

        # filter out the empty clicks
        PD_list = []
        PD_nots = []
        for i, xj in enumerate(x_all):
            PD_list.append([PD_all[i]] * len(xj))
            if len(xj) == 0:
                PD_nots.append(PD_all[i])
                catalog[ID]['agents_reject'].append(i)
        PD = np.array([PDi for PDj in PD_list for PDi in PDj])
        PD_nots = np.array(PD_nots)

        skill = swap.expectedInformationGain(0.5, PL, PD)  # skill

        # it is only fair to write out the NOTs, too
        # do the empty guys
        skill_nots = swap.expectedInformationGain(0.5, PL_nots,
                                                  PD_nots)  # skill

        x, y = -1, -1
        N0 = len(skill_nots)
        S = np.sum(skill_nots)
        D = 0

        ## catalog.append((ID, kind, x, y, P, N0, S, D))
        if len(catalog) % 500 == 0:
            print len(catalog)
        if len(flags['output_name']) > 0:
            F.write('{0},{1},{2},{3},{4},{5},{6},{7}\n'.format(
                ID, kind, x, y, P, N0, S, D))

        if len(x_markers) == 0:
            # apparently everyone was a not...
            catalog[ID]['agents_labels'] = np.array([])
            continue

        # ------------------------------------------------------------------
        # cluster
        print 'make_lens_catalog: subject ID = ', ID
        if flags['skill']:
            cluster_centers, cluster_center_labels, cluster_labels, \
                    n_clusters, dist_within = outlier_clusters(x_markers, y_markers, skill, memory=memory)
        else:
            cluster_centers, cluster_center_labels, cluster_labels, \
                    n_clusters, dist_within = outlier_clusters(x_markers, y_markers, None, memory=memory)
        # need to get: x, y, N0, S

        catalog[ID]['agents_labels'] = cluster_labels

        for cluster_center_label in cluster_center_labels:
            cluster_center = cluster_centers[cluster_center_label]
            members = (cluster_labels == cluster_center_label)

            x, y = cluster_center
            # convert y to catalog convention
            y = flags['image_y_size'] - y
            N0 = np.sum(members)
            S = np.sum(skill[members])
            D = dist_within[cluster_center_label]

            if cluster_center_label == -1:
                # outlier cluster
                # so really every point is its own cluster...
                D = 0
            ## catalog.append((ID, kind, x, y, P, N0, S, D))
            ## if len(catalog)%500 == 0:
            ##     print len(catalog)
            # TODO: make some requirement to be included (exclude outliers)
            if len(flags['output_name']) > 0:
                F.write('{0},{1},{2},{3},{4},{5},{6},{7}\n'.format(
                    ID, kind, x, y, P, N0, S, D))

    print 'make_lens_catalog: Clearing memory'
    # clear memory
    memory.clear()

    if len(flags['output_name']) > 0:
        print 'make_lens_catalog: closing file!'
        F.close()

    if len(flags['update_collection']) > 0:
        print 'make_lens_catalog: writing updated collection to', flags[
            'update_collection']

        # TODO: get the other params correct!!!!
        collection_fat = swap.collection.Collection()
        for ID in catalog:
            subject = collection.member[ID]
            atx = subject.annotationhistory['At_X']
            labels_in = list(catalog[ID]['agents_labels'])
            labels_fat = []
            for atx_i in atx:
                labels_fat.append([])
                for atx_ij in atx_i:
                    labels_fat[-1].append(labels_in.pop(0))
            subject.annotationhistory.update({'labels': labels_fat})
            collection_fat.member.update({ID: subject})
        swap.write_pickle(collection_fat, flags['update_collection'])

    print 'make_lens_catalog: All done!'

    return catalog
Esempio n. 18
0
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import os
import numpy as np
import pandas as pd
import seaborn as sns
import time
from sklearn.externals import joblib
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import OneClassSVM
from ..base import BASE_PATH
from .. import data_sets, feature_extraction

# On-disk joblib cache rooted at <BASE_PATH>/cache; functions decorated with
# ``@_file_memory.cache`` are memoized there.
_file_memory = joblib.Memory(cachedir=os.path.join(BASE_PATH, 'cache'))


@_file_memory.cache
def _build_and_fit(ds_url: str) -> pd.DataFrame:
    result_list = []

    normal_list, anomalous_list = data_sets.get(ds_url)

    for i in range(1, 5):
        n = 10**i
        print('ds_url {} | n {:9,d}'.format(ds_url, n))

        train_list = []
        while len(train_list) < n:
            train_list += normal_list
Esempio n. 19
0
    def fit(self, X, y):
        """Fit the parent pipeline on a ``log1p``-transformed target.

        Parameters
        ----------
        X : training data, forwarded unchanged to the parent ``fit``.
        y : target values; ``np.log1p(y)`` is what the parent is fitted on.

        Returns
        -------
        Whatever the parent class's ``fit`` returns, so that call chaining
        such as ``est.fit(X, y).predict(X)`` keeps working (presumably the
        fitted estimator itself, as in scikit-learn -- the parent class is
        not visible here).
        """
        return super(LogExpPipeline, self).fit(X, np.log1p(y))

    def predict(self, X):
        """Predict with the parent pipeline and undo the log transform.

        The parent's predictions are mapped through ``np.expm1``, the
        inverse of the ``np.log1p`` applied to the target in ``fit``.
        """
        log_scale_prediction = super(LogExpPipeline, self).predict(X)
        return np.expm1(log_scale_prediction)


import backtest

# Prepare the backtest object and take its ``df`` attribute as ground truth.
bt = backtest.Backtest()
bt.init_candle()
bt.resetbacktest()
bt.index = bt.size  # presumably moves the cursor to the last sample -- TODO confirm
bt.updateIndicators()
truth = bt.df

# joblib on-disk cache rooted at the current working directory.
memory = joblib.Memory(cachedir=".")
n = truth.shape[1]  # size of the second axis (column count if ``df`` is 2-D)
#
# XGBoost model
#
# NOTE: the script used to store max_depth = int(6.0002117448743721) and then
# immediately overwrite it with 9; only the effective value is kept here.
xgb_params = {
    'objective': 'reg:linear',
    'learning_rate': 0.001,
    'max_depth': 9,
    'subsample': 0.72476106045336319,
    'min_child_weight': int(4.998433055249718),  # truncates to 4
    # 'colsample_bytree': 0.97058965304691203,
    # 'colsample_bylevel': 0.69302144647951536,
    'reg_alpha': 0.59125639278096453,
    'gamma': 0.11900602913417056,
}
Esempio n. 20
0
        if encoding == 'plain':
            pass
        elif encoding == 'gzip':
            data = StringIO(data)
            data = gzip.GzipFile(fileobj=data).read()
        else:
            raise RuntimeError('unknown encoding')
    else:
        with open(url, 'r') as fid:
            data = fid.read()
        fid.close()

    return data


# joblib on-disk cache stored under ``_build``; ``get_data`` is the memoized
# front end to ``_get_data``.
mem = joblib.Memory(cachedir='_build')
get_data = mem.cache(_get_data)


def parse_sphinx_searchindex(searchindex):
    """Parse a Sphinx search index

    Parameters
    ----------
    searchindex : str
        The Sphinx search index (contents of searchindex.js)

    Returns
    -------
    filenames : list of str
        The file names parsed from the search index.
Esempio n. 21
0
# You'll also need to install Spacy & run:
# python -m spacy download en_core_web_sm

import spacy
import os.path
import numpy as np
import pandas as pd
import string
import tqdm
import random
from sklearn.externals import joblib

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load('en_core_web_sm')
# Translation table for ``str.translate`` that deletes every character in
# ``string.punctuation``.
translator = str.maketrans('', '', string.punctuation)
# joblib on-disk cache in the relative directory ``cache``; used through the
# ``@mem.cache`` decorator.
mem = joblib.Memory('cache')


@mem.cache
def textify(fn):
    """Build one text snippet per line of a comma-separated file.

    Every line is split on commas. The second field is taken as the word;
    the sixth field onward, re-joined with commas and stripped of double
    quotes and newline characters, is taken as its definition.

    Parameters
    ----------
    fn : str
        Path of the file to read.

    Returns
    -------
    list of str
        ``"<definition> <word>"`` for each line, in file order.
    """
    def _snippet(raw_line):
        fields = raw_line.split(',')
        # Fields 0 and 2-4 are ignored; the definition may itself contain
        # commas, hence the re-join of everything from field 5 on.
        meaning = ','.join(fields[5:])
        for unwanted in ('"', '\n'):
            meaning = meaning.replace(unwanted, '')
        return meaning + ' ' + fields[1]

    with open(fn, 'r') as source:
        return [_snippet(raw_line) for raw_line in source]