def partial_leave_one_out_test(model, X, y, n_splits=None, min_n_samples_per_unik_y=None, verbose=1):
    loo = SupervisedLeaveOneOut(
        n_splits=n_splits, min_n_samples_per_unik_y=min_n_samples_per_unik_y)
    n_splits = loo.get_n_splits()
    if verbose > 0:
        printProgress("Number of tests: {}".format(n_splits))
    predicted = list()
    actual = list()
    for i, (train_idx, test_idx) in enumerate(loo.split(X, y), 1):
        XX = X[train_idx, :]
        yy = y[train_idx]
        if verbose > 0:
            printProgress('Test {}/{}'.format(i, n_splits), refresh=True, refresh_suffix=' ')
        model.fit(XX, yy)
        test_x = X[test_idx, :]
        test_y = y[test_idx]
        actual.append(test_y[0])
        predicted.append(model.predict(test_x)[0])
    return array(predicted), array(actual)
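# Usage sketch (illustrative, not part of the original source): assumes the function above
# and its SupervisedLeaveOneOut dependency are in scope, and that the model follows the
# scikit-learn fit/predict interface.
def _example_partial_leave_one_out():
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    X, y = make_classification(n_samples=200, n_features=5, random_state=0)
    predicted, actual = partial_leave_one_out_test(
        LogisticRegression(), X, y, n_splits=50, verbose=0)
    return (predicted == actual).mean()  # proportion of correct held-out predictions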
def dict_of_source_data(self, extractor=None, start=None, stop=None, print_progress_every=None,
                        source_scan_iterator_kwargs={}):
    source_scan_iterator_kwargs['scroll'] = source_scan_iterator_kwargs.get('scroll', '10m')
    extracting_scanner = self.source_scan_iterator(
        extractor=extractor, start=start, stop=stop, **source_scan_iterator_kwargs)
    print_progress_every = print_progress_every or np.inf
    start = start or 0
    d = list()
    for i, item in enumerate(extracting_scanner, start=start):
        if np.mod(i, print_progress_every) == 0:
            printProgress("offset: {}".format(i))
        if item is not None:
            d.append(item)
    return d
def get_ftp_files_I_dont_have(ftp_kwargs, remote_dir=".", local_dir=".", remote_filename_filter=None):
    """
    Get the files of a remote ftp folder that the local folder doesn't already have.
    """
    ftp = FTP(**ftp_kwargs)
    # getting a (possibly filtered) list of remote files
    ftp_files = ftp.nlst(remote_dir)
    if remote_filename_filter is not None:
        printProgress('Remote files: %d' % len(ftp_files))
        try:
            # remote_filename_filter used as a predicate on individual filenames...
            ftp_files = list(filter(remote_filename_filter, ftp_files))
        except Exception:
            # ... or as a function acting on the whole list
            ftp_files = remote_filename_filter(ftp_files)
        printProgress('... After filtering, only %d files left that will be processed' % len(ftp_files))
    else:
        printProgress('Remote files: %d' % len(ftp_files))
    list_of_files_to_fetch = []
    for f in ftp_files:
        local_filepath = os.path.join(local_dir, f)
        if not os.path.exists(local_filepath):
            list_of_files_to_fetch.append(f)
    # for each of the files that local_dir doesn't have, retrieve it and store it locally
    printProgress("Fetching the %d files that (%s) didn't have"
                  % (len(list_of_files_to_fetch), local_dir))
    for f in list_of_files_to_fetch:
        local_filepath = os.path.join(local_dir, f)
        remote_filepath = os.path.join(remote_dir, f)
        printProgress('Getting and storing file to %s' % local_filepath)
        get_file_from_ftp(ftp, remote_filepath, local_filepath)
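# Usage sketch (illustrative, not part of the original source): ftp_kwargs is passed
# straight to ftplib.FTP, so host/user/passwd are standard ftplib arguments; the host,
# directories and filter below are made up, and the filter is used as a plain predicate.
def _example_fetch_missing_csvs():
    get_ftp_files_I_dont_have(
        ftp_kwargs=dict(host='ftp.example.com', user='anonymous', passwd='guest'),
        remote_dir='pub/data',
        local_dir='./data',
        remote_filename_filter=lambda fname: fname.endswith('.csv'),
    )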
def mk_termCounts(dat, indexColName, strColName, data_folder=''):
    """
    input: dat (a DataFrame, or a reference that get_data can load from data_folder),
        and the name of the column holding the strings to count
    output: series of keyword frequency distributions (word counts) of dat[strColName],
        indexed by hotel_id
    """
    from ut.util import log
    from ut.daf.get import get_data
    dat = get_data(dat, data_folder)
    log.printProgress("making {} word counts (wc)".format(strColName))
    sr = to_kw_fd(dat[strColName])
    sr.index = dat.hotel_id.tolist()
    return sr
def stereo_to_mono_and_extreme_silence_cropping(source, target, subtype=None, print_progress=False):
    if os.path.isdir(source) and os.path.isdir(target):
        from glob import iglob
        if source[-1] != '/':
            source += '/'
        for i, filepath in enumerate(iglob(source + '*.wav')):
            filename = os.path.basename(filepath)
            if print_progress:
                printProgress("{}: {}".format(i, filename))
            stereo_to_mono_and_extreme_silence_cropping(
                filepath, os.path.join(target, filename))
    else:
        wf, sr = wf_and_sr(source)
        wf = ensure_mono(wf)
        wf = crop_head_and_tail_silence(wf)
        sf.write(target, wf, samplerate=sr, subtype=subtype)
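# Usage sketch (illustrative, not part of the original source): when both arguments are
# existing folders, every *.wav in the source folder gets a mono, silence-cropped copy of
# the same name in the target folder; the folder paths used here are made up.
def _example_batch_mono_and_crop():
    stereo_to_mono_and_extreme_silence_cropping(
        source='./recordings_stereo', target='./recordings_mono', print_progress=True)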
def to_kw_tokens(dat, indexColName, strColName, data_folder=''):
    r"""
    input: daf of strings
    output: series of the tokens of the strings, processed for AdWords keywords,
        i.e. the string is lower-cased and asciied, and words are [\w&]+
    """
    from ut.util import log
    import ut.pstr.trans
    from ut.daf.get import get_data
    dat = get_data(dat, data_folder)
    log.printProgress("making {} tokens".format(strColName))
    sr = dat[strColName]
    sr.index = dat.hotel_id.tolist()
    # preprocess string
    sr = sr.map(lambda x: x.lower())
    sr = sr.map(lambda x: ut.pstr.trans.toascii(x))
    # tokenize
    sr = sr.map(lambda x: nltk.regexp_tokenize(x, r'[\w&]+'))
    return sr
def get_data(dat, data_folder=''):
    # input: dat (a csv pfile, a data pfile, or the data itself)
    # output: the loaded data
    if isinstance(dat, str):  # if input dat is a string
        root, name, ext = fileparts(dat)
        if root:  # if root is not empty
            data_folder = root
        dataFile = data_file(dat, data_folder)
        if dataFile:
            df = pd.read_pickle(dataFile)  # read_pickle replaces the long-deprecated pd.load
        else:
            delimFile = delim_file(dat)
            if delimFile:
                log.printProgress('csv->DataFrame')
                df = pd.read_csv(delimFile)
            else:
                raise NameError('FileNotFound')
        return df
    else:  # assume isinstance(dat, pd.DataFrame) or isinstance(dat, pd.Series)
        return dat
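# Usage sketch (illustrative, not part of the original source): get_data is a pass-through
# for in-memory data and a loader for file references (via the module's data_file/delim_file
# helpers); the filename and folder below are made up.
def _example_get_data():
    df_from_memory = get_data(pd.DataFrame({'a': [1, 2, 3]}))  # returned as-is
    df_from_file = get_data('my_table.csv', data_folder='./data')  # loaded from disk
    return df_from_memory, df_from_file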
def _import_data_into_mongo(filepath='gl_centroids_utf8.csv', mongo_db='util', mongo_collection='geo_pop_density',
                            index_precision_meters=76.8, print_mongo_import_progress_every=50000):
    bits = _meters_to_bits(index_precision_meters)
    printProgress("importing %s into dataframe" % filepath)
    d = pd.read_csv(filepath, header=0, sep=',', quotechar="'")
    space_re = re.compile(r'\s')
    d.columns = [space_re.sub('_', str(x).lower()) for x in d.columns]  # I want lower-case, no-space columns
    printProgress("importing dataframe rows into mongo (will print progress every %d items)"
                  % print_mongo_import_progress_every)
    mc = MongoClient()
    db = mc[mongo_db]
    db.drop_collection(mongo_collection)
    db.create_collection(mongo_collection)
    mg_collection = db[mongo_collection]
    n = len(d)
    for i, di in enumerate(d.iterrows()):
        ddi = _process_dict(dict(di[1]))
        if fmod(i, print_mongo_import_progress_every) == 0:
            printProgress(" %d/%d" % (i, n))
        try:
            mg_collection.insert(ddi, w=0)
        except InvalidStringData:
            ddi = {k: str_to_utf8_or_bust(v) for k, v in ddi.items()}
            mg_collection.insert(ddi, w=0)
    printProgress("ensuring GEOSPHERE index with %d bits (for a precision of %d meters or more)"
                  % (bits, index_precision_meters))
    from pymongo import GEOSPHERE
    mg_collection.ensure_index([("cen", GEOSPHERE), ("bits", bits)])
    printProgress("------------------------------ DONE ------------------------------")
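# Query sketch (illustrative, not part of the original source): assuming the "cen" field
# ends up holding GeoJSON points (which _process_dict is presumed to produce), the GEOSPHERE
# index built above supports standard $nearSphere lookups; the coordinates, distance and
# limit below are arbitrary.
def _example_nearest_centroids(lon=-73.98, lat=40.75, max_meters=500):
    mg_collection = MongoClient()['util']['geo_pop_density']
    return list(mg_collection.find(
        {'cen': {'$nearSphere': {
            '$geometry': {'type': 'Point', 'coordinates': [lon, lat]},
            '$maxDistance': max_meters}}}
    ).limit(10))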
def copy_collection_from_remote_to_local(remote_client, remote_db, remote_collection,
                                         local_db=None, local_collection=None,
                                         max_docs_per_collection=inf, verbose=False):
    local_db = local_db or remote_db
    local_collection = local_collection or remote_collection
    remote_collection_connection = remote_client[remote_db][remote_collection]
    local_db_connection = mg.MongoClient()[local_db]
    if local_collection in local_db_connection.collection_names():
        print("Local collection '{}' existed and is being deleted".format(local_collection))
        try:
            local_db_connection[local_collection].drop()
        except mg.errors.OperationFailure as e:
            print(" !!! Nope, can't delete that: {}".format(e))
    local_collection_connection = local_db_connection[local_collection]
    for i, d in enumerate(remote_collection_connection.find()):
        if i < max_docs_per_collection:
            if verbose:
                printProgress("item {}".format(i))
            local_collection_connection.insert(d)
        else:
            break
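# Usage sketch (illustrative, not part of the original source): the remote connection
# string and database/collection names below are placeholders; the local side always goes
# through a default MongoClient() on localhost, as in the function above.
def _example_copy_collection():
    remote_client = mg.MongoClient('mongodb://user:password@remote.example.com:27017/')
    copy_collection_from_remote_to_local(
        remote_client, remote_db='analytics', remote_collection='events',
        max_docs_per_collection=10000, verbose=True)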
def print_progress(self, min_level, msg='', verbose_level=None):
    verbose_level = verbose_level or self.verbose_level
    if verbose_level >= min_level:
        msg = 2 * min_level * ' ' + msg
        util_log.printProgress(msg)
def try_out_multiple_classifiers(datasets, classifiers=None, print_progress=True, **kwargs):
    h = .02  # step size in the mesh
    # classifier_names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
    #                     "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
    if isinstance(classifiers, int):
        classifiers = default_classifiers[:(classifiers + 1)]
    else:
        classifiers = classifiers or default_classifiers
    classifier_names = list(map(lambda x: str(x.__class__).split('.')[-1][:-2], classifiers))

    if datasets is None:
        X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                                   random_state=1, n_clusters_per_class=1)
        rng = np.random.RandomState(2)
        X += 2 * rng.uniform(size=X.shape)
        linearly_separable = (X, y)
        datasets = [make_moons(noise=0.3, random_state=0),
                    make_circles(noise=0.2, factor=0.5, random_state=1),
                    linearly_separable]
    elif not np.iterable(datasets):
        datasets = [datasets]

    try:  # getting num_of_datasets
        num_of_datasets = len(datasets)
    except TypeError:  # if datasets is an iterable, there should be a kwargs['num_of_datasets']
        num_of_datasets = kwargs['num_of_datasets']

    if kwargs.get('dataset_names'):
        dataset_names = kwargs['dataset_names']
        if isinstance(dataset_names, list):
            assert len(dataset_names) == len(datasets), \
                "You should have the same number of dataset names as there are datasets"
            dataset_names = iter(dataset_names)
    else:
        dataset_names = map(lambda x: "Dataset #%d" % x, itertools.count())

    figsize_multiplier = 3
    figure = pl.figure(figsize=((len(classifiers) + 1) * figsize_multiplier,
                                num_of_datasets * figsize_multiplier))
    i = 1
    row_num = 0
    col_num = 0
    # iterate over datasets
    for dataset_num, ds in enumerate(datasets):
        row_num += 1
        col_num += 1
        this_dataset_name = next(dataset_names)
        if print_progress:
            printProgress('----- %s -----' % this_dataset_name)
        # preprocess dataset, split into training and test part
        X, y = ds
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

        # just plot the dataset first
        cm = pl.cm.RdBu
        cm_bright = kwargs.get('cm_bright', ListedColormap(['#FF0000', '#0000FF']))
        ax = pl.subplot(num_of_datasets, len(classifiers) + 1, i)
        # Plot the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        i += 1
        plt.ylabel(this_dataset_name)

        # iterate over classifiers
        for name, clf in zip(classifier_names, classifiers):
            col_num += 1
            if print_progress:
                printProgress(' %s' % name)
            ax = pl.subplot(num_of_datasets, len(classifiers) + 1, i)
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)

            # Plot the decision boundary. For that, we will assign a color to each
            # point in the mesh [x_min, x_max]x[y_min, y_max].
            if hasattr(clf, "decision_function"):
                Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
            else:
                Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
            # Plot also the training points
            ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
            # and testing points
            ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
            ax.set_xlim(xx.min(), xx.max())
            ax.set_ylim(yy.min(), yy.max())
            ax.set_xticks(())
            ax.set_yticks(())
            ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                    size=15, horizontalalignment='right')
            i += 1
            if row_num == num_of_datasets:
                plt.xlabel(name)

    figure.subplots_adjust(left=.02, right=.98)
    pl.show()
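# Usage sketch (illustrative, not part of the original source): the plotting code assumes
# 2-D feature spaces and binary labels, so this example reduces iris to its first two
# features and a one-vs-rest label before handing it over.
def _example_try_out_classifiers_on_iris():
    from sklearn.datasets import load_iris
    data = load_iris()
    X = data.data[:, :2]  # keep two features so the decision-boundary mesh can be drawn
    y = (data.target == 0).astype(int)
    try_out_multiple_classifiers([(X, y)], dataset_names=['iris: setosa vs rest'])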
def put_in_attr(self, name, data):
    util_log.printProgress(' Storing %s in attribute' % name)
    setattr(self, name, data)
def put_in_store(self, name, data):
    util_log.printProgress(' Storing %s in store' % name)
    self.store.put(name, data)
def test_classifiers(X, y,
                     scoring=default_scorers,
                     score_aggreg=default_score_aggreg,
                     n_features=7,  # an int is transformed to a list (of that size) of different numbers of features
                     clfs=None,
                     nfolds=10,
                     scale=None,
                     decompose=None,
                     select=None,
                     decompose_params={},
                     print_progress=False,
                     score_to_plot=None):
    """
    Tests and scores (with the given scoring and score_aggreg) several classifiers (given by clfs)
    with several numbers of features, returning a pandas DataFrame of the results.
    """
    scoring = scoring or default_scorers
    score_aggreg = score_aggreg or default_score_aggreg

    if isinstance(n_features, int):
        # if n_features is an int, it's the number of different feature-set lengths to try out,
        # so make that list of feature-set lengths
        total_n_features = np.shape(X)[1]
        n_features = range(1, total_n_features + 1,
                           int(np.floor(total_n_features / n_features)))[:n_features]

    y = np.asarray(y, dtype="|S6")
    n_features = np.array(n_features)

    if clfs is None:
        clfs = default_classifiers
    clfs = clfs_to_dict_clfs(clfs)

    general_info_dict = dict()
    if scale is not None and scale is not False:
        # e.g. preprocessing.StandardScaler(), preprocessing.MinMaxScaler()
        if scale is True:
            scale = preprocessing.StandardScaler()
        general_info_dict['scale'] = get_name(scale)
    if decompose is not None and decompose is not False:
        if decompose is True:
            # e.g. PCA, KernelPCA, ProbabilisticPCA, RandomizedPCA, TruncatedSVD
            decompose = decomposition.PCA(**decompose_params)
        general_info_dict['decompose'] = get_name(decompose)

    clf_results = list()
    for i_nfeats, nfeats in enumerate(n_features):
        for i_clf, clf in enumerate(clfs):
            clf_name = list(clf.keys())[0]
            clf = clf[clf_name]
            d = dict(general_info_dict, **{'model': clf_name, 'nfeats': nfeats})
            if print_progress:
                printProgress("{}: nfeats={}, nfolds={}".format(
                    clf_name, n_features[i_nfeats], nfolds))
            start_time = datetime.now()
            score_result = score_classifier(
                X, y,
                clf=clf,
                nfeats=nfeats,
                scoring=scoring,
                score_aggreg=score_aggreg,
                nfolds=nfolds,
                scale=scale,
                decompose=decompose,
                select=select,
                decompose_params=decompose_params)
            d.update({'seconds': (datetime.now() - start_time).total_seconds()})
            d.update(score_result.to_dict())
            clf_results.append(d)  # accumulate results

    clf_results = pd.DataFrame(clf_results)
    if score_to_plot:
        if score_to_plot is True:
            score_to_plot = mk_aggreg_score_name(
                score_aggreg_name=list(mk_score_aggreg_dict(score_aggreg).keys())[0],
                score_name=list(mk_scoring_dict(scoring).keys())[0])
        plot_score(clf_results, score_to_plot)

    return reorder_columns_as(clf_results, ['model', 'nfeats', 'seconds'])
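# Usage sketch (illustrative, not part of the original source): relies on the module's
# default_classifiers / default_scorers, so the only required inputs are a feature matrix
# and a label vector.
def _example_test_classifiers():
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=300, n_features=10, n_informative=5, random_state=0)
    results = test_classifiers(X, y, n_features=3, nfolds=5, scale=True, print_progress=True)
    return results.sort_values('seconds')  # one row per (model, nfeats) pair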