Example #1
def partial_leave_one_out_test(model,
                               X,
                               y,
                               n_splits=None,
                               min_n_samples_per_unik_y=None,
                               verbose=1):
    loo = SupervisedLeaveOneOut(
        n_splits=n_splits, min_n_samples_per_unik_y=min_n_samples_per_unik_y)
    n_splits = loo.get_n_splits()
    if verbose > 0:
        printProgress("Number of tests: {}".format(n_splits))

    predicted = list()
    actual = list()
    for i, (train_idx, test_idx) in enumerate(loo.split(X, y), 1):
        XX = X[train_idx, :]
        yy = y[train_idx]
        if verbose > 0:
            printProgress('Test {}/{}'.format(i, n_splits),
                          refresh=True,
                          refresh_suffix='   ')
        model.fit(XX, yy)
        test_x = X[test_idx, :]
        test_y = y[test_idx]
        actual.append(test_y[0])
        predicted.append(model.predict(test_x)[0])

    return array(predicted), array(actual)
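A minimal usage sketch (hypothetical data and classifier; assumes numpy and scikit-learn are installed, and that the SupervisedLeaveOneOut splitter above accepts plain arrays):

# Hypothetical usage: score a classifier with the partial leave-one-out helper above.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

X = np.random.rand(200, 5)             # 200 toy samples, 5 features
y = np.random.randint(0, 3, size=200)  # 3 classes
predicted, actual = partial_leave_one_out_test(
    KNeighborsClassifier(n_neighbors=3), X, y, n_splits=50, verbose=0)
print("accuracy:", np.mean(predicted == actual))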
Example #2
    def dict_of_source_data(self,
                            extractor=None,
                            start=None,
                            stop=None,
                            print_progress_every=None,
                            source_scan_iterator_kwargs=None):

        # avoid a shared mutable default; keep the '10m' scroll default
        source_scan_iterator_kwargs = dict(source_scan_iterator_kwargs or {})
        source_scan_iterator_kwargs.setdefault('scroll', '10m')
        extracting_scanner = self.source_scan_iterator(
            extractor=extractor,
            start=start,
            stop=stop,
            **source_scan_iterator_kwargs)
        print_progress_every = print_progress_every or np.inf
        start = start or 0

        d = list()
        for i, item in enumerate(extracting_scanner, start=start):
            if np.mod(i, print_progress_every) == 0:
                printProgress("offset: {}".format(i))
            if item is not None:
                d.append(item)

        return d
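Despite its name, the method returns a list of the non-None extracted items. A hedged call sketch, where `indexer` is a hypothetical instance of the class this method belongs to:

# Hypothetical usage: collect one field per Elasticsearch hit, logging every 1000 items.
items = indexer.dict_of_source_data(
    extractor=lambda hit: hit.get('_source', {}).get('title'),
    stop=10000,
    print_progress_every=1000,
    source_scan_iterator_kwargs={'scroll': '5m'},  # overrides the '10m' default
)
print(len(items), "items collected")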
Example #3
File: get.py Project: yz-/ut
def get_ftp_files_I_dont_have(ftp_kwargs,
                              remote_dir=".",
                              local_dir=".",
                              remote_filename_filter=None):
    """
    Fetch files from a remote FTP folder that are missing from a local folder.
    """
    ftp = FTP(**ftp_kwargs)

    # getting a (possibly filtered) list of remote files
    ftp_files = ftp.nlst(remote_dir)
    if remote_filename_filter is not None:
        printProgress('Remote files: %d' % len(ftp_files))
        try:
            ftp_files = list(filter(remote_filename_filter, ftp_files))
        except Exception:
            ftp_files = remote_filename_filter(ftp_files)
        printProgress('... After filtering, only %d files left that will be processed' % len(ftp_files))
    else:
        printProgress('Remote files: %d' % len(ftp_files))

    list_of_files_to_fetch = []
    for f in ftp_files:
        local_filepath = os.path.join(local_dir, f)
        if not os.path.exists(local_filepath):
            list_of_files_to_fetch.append(f)

    # for each of the files that local_directory doesn't have, retrieve it and store it locally
    printProgress("Fetching the %d files that (%s) didn't have" % \
                  (len(list_of_files_to_fetch), local_dir))
    for f in list_of_files_to_fetch:
        local_filepath = os.path.join(local_dir, f)
        remote_filepath = os.path.join(remote_dir, f)
        printProgress('Getting and storing file to %s' % local_filepath)
        get_file_from_ftp(ftp, remote_filepath, local_filepath)
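A call sketch with made-up credentials; ftp_kwargs is passed straight to ftplib.FTP, and the filter is an ordinary predicate on filenames:

# Hypothetical usage: fetch only the remote .csv files that are missing locally.
get_ftp_files_I_dont_have(
    ftp_kwargs=dict(host='ftp.example.com', user='me', passwd='secret'),
    remote_dir='exports',
    local_dir='./exports',
    remote_filename_filter=lambda name: name.endswith('.csv'),
)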
Example #4
def mk_termCounts(dat, indexColName, strColName, data_folder=''):
    """
    input: data_folder='', dataname, savename='',
    output: string of ascii char correspondents
      (replacing, for example, accentuated letters with non-accentuated versions of the latter)
    """
    from ut.util import log
    from ut.daf.get import get_data
    dat = get_data(dat, data_folder)
    log.printProgress("making {} word counts (wc)".format(strColName))
    sr = to_kw_fd(dat[strColName])
    sr.index = dat.hotel_id.tolist()
    return sr
Example #5
File: util.py Project: yz-/ut
def stereo_to_mono_and_extreme_silence_cropping(source, target, subtype=None, print_progress=False):
    if os.path.isdir(source) and os.path.isdir(target):
        from glob import iglob
        if source[-1] != '/':
            source += '/'
        for i, filepath in enumerate(iglob(source + '*.wav')):
            filename = os.path.basename(filepath)
            if print_progress:
                printProgress("{}: {}".format(i, filename))
            stereo_to_mono_and_extreme_silence_cropping(
                filepath,
                os.path.join(target, filename)
            )
    else:
        wf, sr = wf_and_sr(source)
        wf = ensure_mono(wf)
        wf = crop_head_and_tail_silence(wf)
        sf.write(data=wf, file=target, samplerate=sr, subtype=subtype)
Example #6
def stereo_to_mono_and_extreme_silence_cropping(source,
                                                target,
                                                subtype=None,
                                                print_progress=False):
    if os.path.isdir(source) and os.path.isdir(target):
        from glob import iglob
        if source[-1] != '/':
            source += '/'
        for i, filepath in enumerate(iglob(source + '*.wav')):
            filename = os.path.basename(filepath)
            if print_progress:
                printProgress("{}: {}".format(i, filename))
            stereo_to_mono_and_extreme_silence_cropping(
                filepath, os.path.join(target, filename))
    else:
        wf, sr = wf_and_sr(source)
        wf = ensure_mono(wf)
        wf = crop_head_and_tail_silence(wf)
        sf.write(target, wf, samplerate=sr, subtype=subtype)
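Two call sketches, one per branch of the function above (the paths are made up; soundfile and the project's wf/sr helpers are assumed to be importable):

# Single file: downmix to mono, crop leading/trailing silence, write the result.
stereo_to_mono_and_extreme_silence_cropping('take1.wav', 'take1_mono.wav', subtype='PCM_16')

# Whole directory: every *.wav in recordings/ is processed into processed/, with progress lines.
stereo_to_mono_and_extreme_silence_cropping('recordings/', 'processed/', print_progress=True)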
Example #7
def to_kw_tokens(dat, indexColName, strColName, data_folder=''):
    """
    input: daf of strings
    output: series of the tokens of the strings, processed for AdWords keywords
      i.e. the string is lowercased and ASCII-folded, and tokens match [\w&]+
    """
    from ut.util import log
    import ut.pstr.trans
    from ut.daf.get import get_data
    dat = get_data(dat, data_folder)
    log.printProgress("making {} tokens",strColName)
    sr = dat[strColName]
    sr.index = dat.hotel_id.tolist()
    # preprocess string
    sr = sr.map(lambda x: x.lower())
    sr = sr.map(lambda x: ut.pstr.trans.toascii(x))
    # tokenize
    sr = sr.map(lambda x: nltk.regexp_tokenize(x, r'[\w&]+'))
    # return this
    return sr
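A small sketch of what the tokenization step does on its own (the sample string is made up; accent stripping is done by the project's ut.pstr.trans.toascii, not shown here):

import nltk

s = "Hôtel & Spa près de la gare".lower()
print(nltk.regexp_tokenize(s, r'[\w&]+'))
# -> ['hôtel', '&', 'spa', 'près', 'de', 'la', 'gare'] (accents are removed in the ascii step)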
Example #8
File: get.py Project: yz-/ut
def get_data(dat, data_folder=''):
# input: dat (a csv pfile, a data pfile, or the data itself)
# output: the loaded DataFrame (or dat passed through if it already is the data)
    if isinstance(dat, str):  # if input dat is a string
        root, name, ext = fileparts(dat)
        if root:  # if root is not empty
            data_folder = root
        dataFile = data_file(dat, data_folder)
        if dataFile:
            df = pd.read_pickle(dataFile)
        else:
            delimFile = delim_file(dat)
            if delimFile:
                log.printProgress('csv->DataFrame')
                df = pd.read_csv(delimFile)
            else:
                raise NameError('FileNotFound')
        return df
    else: # assume isinstance(dat,pd.DataFrame) or isinstance(dat,pd.Series)
        return dat
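The three kinds of input it accepts, sketched (the names and paths here are hypothetical):

df = get_data('hotels', data_folder='~/data')    # resolved to a pickled data file via data_file()
df = get_data('exports/hotels.csv')              # a delimited file, loaded with read_csv
df = get_data(already_loaded_dataframe)          # a DataFrame/Series is passed through unchanged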
Example #9
File: util.py Project: yz-/ut
def copy_collection_from_remote_to_local(remote_client, remote_db, remote_collection,
                                         local_db=None, local_collection=None,
                                         max_docs_per_collection=inf, verbose=False):
    local_db = local_db or remote_db
    local_collection = local_collection or remote_collection

    remote_collection_connection = remote_client[remote_db][remote_collection]

    local_db_connection = mg.MongoClient()[local_db]
    if local_collection in local_db_connection.list_collection_names():
        print("Local collection '{}' existed and is being deleted".format(local_collection))
        try:
            local_db_connection[local_collection].drop()
        except mg.errors.OperationFailure as e:
            print("  !!! Nope, can't delete that: {}".format(e.message))
    local_collection_connection = local_db_connection[local_collection]
    for i, d in enumerate(remote_collection_connection.find()):
        if i < max_docs_per_collection:
            if verbose:
                printProgress("item {}".format(i))
            local_collection_connection.insert_one(d)
        else:
            break
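A call sketch, assuming mg is pymongo (as the code suggests) and that the remote URI and names are placeholders:

import pymongo as mg

remote = mg.MongoClient('mongodb://user:pass@remote.example.com:27017/')
copy_collection_from_remote_to_local(
    remote, 'analytics', 'events',
    local_collection='events_snapshot',   # local_db defaults to the remote db name
    max_docs_per_collection=100000,
    verbose=True,
)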
Example #10
File: geopop.py Project: yz-/ut
def _import_data_into_mongo(filepath='gl_centroids_utf8.csv',
                            mongo_db='util',
                            mongo_collection='geo_pop_density',
                            index_precision_meters=76.8,
                            print_mongo_import_progress_every=50000):

    bits = _meters_to_bits(index_precision_meters)
    printProgress("importing %s into dataframe" % filepath)
    d = pd.read_csv(filepath, header=0, sep=',', quotechar="'")
    space_re = re.compile(r'\s')
    d.columns = [space_re.sub('_', str(x).lower()) for x in d.columns]  # I want lower and no-space_columns

    printProgress("importing dataframe rows into mongo (will print progress every %d items"
                  % print_mongo_import_progress_every)
    mc = MongoClient()
    db = mc[mongo_db]
    db.drop_collection(mongo_collection)
    db.create_collection(mongo_collection)
    mg_collection = db[mongo_collection]

    n = len(d)
    for i, di in enumerate(d.iterrows()):
        ddi = _process_dict(dict(di[1]))
        if fmod(i, print_mongo_import_progress_every) == 0:
            printProgress("  %d/%d" % (i, n))
        try:
            mg_collection.insert(ddi, w=0)
        except InvalidStringData:
            ddi = {k: str_to_utf8_or_bust(v) for k, v in ddi.items()}
            mg_collection.insert(ddi, w=0)

    printProgress("ensuring GEOSPHERE index with %d bits (for a precision of %d meters or more"
                  % (bits, index_precision_meters))
    from pymongo import GEOSPHERE
    mg_collection.ensure_index([("cen", GEOSPHERE), ("bits", bits)])
    printProgress("------------------------------ DONE ------------------------------")
Example #11
 def print_progress(self, min_level, msg='', verbose_level=None):
     verbose_level = verbose_level or self.verbose_level
     if verbose_level >= min_level:
         msg = 2 * min_level * ' ' + msg
         util_log.printProgress(msg)
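What the verbosity/indentation scheme amounts to, sketched with a hypothetical object whose verbose_level is 2:

obj.print_progress(1, 'loading data')    # printed, indented by 2 spaces
obj.print_progress(2, 'parsing chunk')   # printed, indented by 4 spaces
obj.print_progress(3, 'byte offsets')    # suppressed: min_level 3 exceeds verbose_level 2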
Example #12
def try_out_multiple_classifiers(datasets, classifiers=None, print_progress=True, **kwargs):
    h = .02  # step size in the mesh

    # classifier_names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
    #          "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]

    if isinstance(classifiers, int):
        classifiers = default_classifiers[:(classifiers+1)]
    else:
        classifiers = classifiers or default_classifiers
    classifier_names = list(map(lambda x: str(x.__class__).split('.')[-1][:-2], classifiers))

    if datasets is None:
        X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                                   random_state=1, n_clusters_per_class=1)
        rng = np.random.RandomState(2)
        X += 2 * rng.uniform(size=X.shape)
        linearly_separable = (X, y)
        datasets = [make_moons(noise=0.3, random_state=0),
                    make_circles(noise=0.2, factor=0.5, random_state=1),
                    linearly_separable
                    ]
    elif not np.iterable(datasets):
        datasets = [datasets]

    try:  # getting num_of_datasets
        num_of_datasets = len(datasets)
    except TypeError:  # if datasets is an iterable, there should be a kwargs['num_of_datasets']
        num_of_datasets = kwargs['num_of_datasets']


    if kwargs.get('dataset_names'):
        dataset_names = kwargs['dataset_names']
        if isinstance(dataset_names, list):
            assert len(dataset_names) == len(datasets), \
                "You should have the same number of dataset names as there are datasets"
        dataset_names = iter(dataset_names)
    else:
        dataset_names = map(lambda x: "Dataset #%d" % x, itertools.count())
        # dataset_names = map(lambda x: "Dataset #%d" % x, xrange(len(datasets)))

    figsize_multiplier = 3
    figure = pl.figure(figsize=((len(classifiers) + 1) * figsize_multiplier, num_of_datasets * figsize_multiplier))

    # ax_list = list()
    i = 1
    row_num = 0
    col_num = 0
    # iterate over datasets
    for dataset_num, ds in enumerate(datasets):
        row_num += 1
        col_num += 1
        this_dataset_name = next(dataset_names)
        if print_progress:
            printProgress('----- %s -----' % this_dataset_name)
        # preprocess dataset, split into training and test part
        X, y = ds
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))

        # just plot the dataset first
        cm = pl.cm.RdBu
        cm_bright = kwargs.get('cm_bright', ListedColormap(['#FF0000', '#0000FF']))
        ax = pl.subplot(num_of_datasets, len(classifiers) + 1, i)
        # Plot the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        i += 1
        # ax_list.append(ax)
        plt.ylabel(this_dataset_name)
        # iterate over classifiers
        for name, clf in zip(classifier_names, classifiers):
            col_num += 1
            if print_progress:
                printProgress('    %s' % name)
            ax = pl.subplot(num_of_datasets, len(classifiers) + 1, i)
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)

            # Plot the decision boundary. For that, we will assign a color to each
            # point in the mesh [x_min, m_max]x[y_min, y_max].
            if hasattr(clf, "decision_function"):
                Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
            else:
                Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

            # Plot also the training points
            ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
            # and testing points
            ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                       alpha=0.6)

            ax.set_xlim(xx.min(), xx.max())
            ax.set_ylim(yy.min(), yy.max())
            ax.set_xticks(())
            ax.set_yticks(())
            # ax.set_title(name)
            ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                    size=15, horizontalalignment='right')
            i += 1
            # ax_list.append(ax)
            if row_num == num_of_datasets:
                plt.xlabel(name)


    figure.subplots_adjust(left=.02, right=.98)
    pl.show()
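A minimal run sketch; passing None for datasets uses the built-in toy datasets, an int for classifiers takes a prefix of default_classifiers, and a single (X, y) pair must be wrapped in a list (X_my and y_my are hypothetical, already-loaded arrays):

# Compare a few default classifiers on the three built-in toy datasets.
try_out_multiple_classifiers(None, classifiers=3, print_progress=True)

# Or on your own data.
try_out_multiple_classifiers([(X_my, y_my)], dataset_names=['my dataset'])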
Example #13
 def put_in_attr(self, name, data):
     util_log.printProgress('  Storing %s in attribute' % name)
     setattr(self, name, data)
Example #14
 def put_in_store(self, name, data):
     util_log.printProgress('  Storing %s in store' % name)
     self.store.put(name, data)
Example #15
def test_classifiers(
        X,
        y,
        scoring=default_scorers,
        score_aggreg=default_score_aggreg,
        n_features=7,  # an int will be transformed to a list (with different num of features) of given size
        clfs=None,
        nfolds=10,
        scale=None,
        decompose=None,
        select=None,
        decompose_params={},
        print_progress=False,
        score_to_plot=None):
    """
    Tests and scores (using the given scoring and score_aggreg) several classifiers (given by clfs) for several
    numbers of features, returning a pandas DataFrame of the results.
    """
    scoring = scoring or default_scorers
    score_aggreg = score_aggreg or default_score_aggreg

    if isinstance(
            n_features, int
    ):  # if n_features is an int, it's the number of different feature set lens to try out
        # ... so make this feature set len list
        total_n_features = np.shape(X)[1]
        n_features = range(1, total_n_features + 1,
                           int(np.floor(total_n_features /
                                        n_features)))[:n_features]
    y = np.asarray(y, dtype="|S6")
    n_features = np.array(n_features)

    if clfs is None:
        clfs = default_classifiers

    clfs = clfs_to_dict_clfs(clfs)

    general_info_dict = dict()
    if scale is not None and scale is not False:  # preprocessing.StandardScaler(), preprocessing.MinMaxScaler()
        if scale is True:
            scale = preprocessing.StandardScaler()
        general_info_dict['scale'] = get_name(scale)
    if decompose is not None and decompose is not False:
        if decompose is True:
            decompose = decomposition.PCA(
                **decompose_params
            )  # PCA, KernelPCA, ProbabilisticPCA, RandomizedPCA, TruncatedSVD
        general_info_dict['decompose'] = get_name(decompose)

    clf_results = list()

    for i_nfeats, nfeats in enumerate(n_features):
        for i_clf, clf in enumerate(clfs):
            clf_name = list(clf.keys())[0]
            clf = clf[clf_name]
            d = dict(general_info_dict, **{
                'model': clf_name,
                'nfeats': nfeats
            })
            if print_progress:
                printProgress("{}: nfeats={}, nfolds={}".format(
                    clf_name, n_features[i_nfeats], nfolds))
            # try:
            start_time = datetime.now()
            score_result = \
                score_classifier(X,
                   y,
                   clf=clf,
                   nfeats=nfeats,
                   scoring=scoring,
                   score_aggreg=score_aggreg,
                   nfolds=nfolds,
                   scale=scale,
                   decompose=decompose,
                   select=select,
                   decompose_params=decompose_params)
            d.update(
                {'seconds': (datetime.now() - start_time).total_seconds()})
            d.update(score_result.to_dict())
            # except ValueError as e:
            #     raise e
            #     print("Error with: {} ({} features)".format(get_name(clf),
            #                                         n_features[i_nfeats]))

            clf_results.append(d)  # accumulate results

    clf_results = pd.DataFrame(clf_results)
    if score_to_plot:
        if score_to_plot is True:
            score_to_plot = mk_aggreg_score_name(
                score_aggreg_name=list(mk_score_aggreg_dict(score_aggreg).keys())[0],
                score_name=list(mk_scoring_dict(scoring).keys())[0])
        plot_score(clf_results, score_to_plot)

    return reorder_columns_as(clf_results, ['model', 'nfeats', 'seconds'])
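A run sketch on a scikit-learn toy dataset (assuming the module's default_classifiers, scorers and aggregators are defined as the snippet implies):

from sklearn.datasets import load_iris

iris = load_iris()
results = test_classifiers(iris.data, iris.target,
                           n_features=3,     # try 3 different feature-count settings
                           nfolds=5,
                           scale=True,       # StandardScaler, per the code above
                           print_progress=True)
print(results[['model', 'nfeats', 'seconds']].head())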