def extract_group_components(subject_components, variances,
                             ccs_threshold=None, n_group_components=None,
                             cachedir=None):
    # Use asarray to cast to a non-memmapped array
    subject_components = np.asarray(subject_components)
    if len(subject_components) == 1:
        # We are in a single-subject case
        return subject_components[0, :n_group_components].T, \
            variances[0][:n_group_components]

    # The group components (concatenated subject components)
    group_components = subject_components.T
    group_components = np.reshape(group_components,
                                  (group_components.shape[0], -1))
    # Save memory
    del subject_components

    # Inter-subject CCA
    memory = Memory(cachedir=cachedir, mmap_mode='r')
    svd = memory.cache(linalg.svd)
    cca_maps, ccs, _ = svd(group_components, full_matrices=False)
    # Save memory
    del group_components

    if n_group_components is None:
        n_group_components = np.argmin(ccs > ccs_threshold)
    cca_maps = cca_maps[:, :n_group_components]
    ccs = ccs[:n_group_components]
    return cca_maps, ccs
def __init__(self, hmm=None, n_components=16, covariance_type='diag',
             min_duration=0.250, feature=None, cache=False):

    super(SpeechActivityDetection, self).__init__()

    self.hmm = hmm
    self.hmm.min_duration = min_duration

    # default features for speech activity detection
    # are MFCC (12 coefficients + delta coefficient + delta energy)
    if feature is None:
        from pyannote.feature.yaafe import YaafeMFCC
        feature = YaafeMFCC(e=False, coefs=12, De=True, D=True)
    self.feature = feature

    if cache:
        # initialize cache
        from joblib import Memory
        from tempfile import mkdtemp
        memory = Memory(cachedir=mkdtemp(), verbose=0)

        # cache feature extraction method
        self.get_features = memory.cache(self.get_features)
def __init__(self, gmm_ubm, feature=None, cache=False):

    super(SpeakerIdentification, self).__init__()

    self.gmm_ubm = gmm_ubm

    # default features for speaker identification are MFCC
    # 13 coefs + delta coefs + delta delta coefs
    # + delta energy + delta delta energy
    if feature is None:
        from pyannote.feature.yaafe import YaafeMFCC
        feature = YaafeMFCC(
            e=False, De=True, DDe=True,
            coefs=13, D=True, DD=True
        )
    self.feature = feature

    if cache:
        # initialize cache
        from joblib import Memory
        from tempfile import mkdtemp
        memory = Memory(cachedir=mkdtemp(), verbose=0)

        # cache feature extraction method
        self.get_features = memory.cache(self.get_features)
def load_adni_longitudinal_rs_fmri(dirname='ADNI_longitudinal_rs_fmri', prefix='wr*.nii'): """ Returns paths of ADNI rs-fMRI """ # get file paths and description images, subject_paths, description = _get_subjects_and_description( base_dir=dirname, prefix='I[0-9]*') images = np.array(images) # get func files func_files = list(map(lambda x: _glob_subject_img( x, suffix='func/' + prefix, first_img=True), subject_paths)) func_files = np.array(func_files) # get motion files # motions = None motions = list(map(lambda x: _glob_subject_img( x, suffix='func/' + 'rp_*.txt', first_img=True), subject_paths)) # get phenotype from csv dx = pd.read_csv(os.path.join(_get_data_base_dir('ADNI_csv'), 'DXSUM_PDXCONV_ADNIALL.csv')) roster = pd.read_csv(os.path.join(_get_data_base_dir('ADNI_csv'), 'ROSTER.csv')) df = description[description['Image_ID'].isin(images)] df = df.sort_values(by='Image_ID') dx_group = np.array(df['DX_Group']) subjects = np.array(df['Subject_ID']) exams = np.array(df['EXAM_DATE']) exams = [date(int(e[:4]), int(e[5:7]), int(e[8:])) for e in exams] # caching dataframe extraction functions CACHE_DIR = _get_cache_base_dir() cache_dir = os.path.join(CACHE_DIR, 'joblib', 'load_data_cache') if not os.path.isdir(cache_dir): os.makedirs(cache_dir) memory = Memory(cachedir=cache_dir, verbose=0) def _get_ridsfmri(subjects): return [_ptid_to_rid(s, roster) for s in subjects] rids = np.array(memory.cache(_get_ridsfmri)(subjects)) def _get_examdatesfmri(rids): return [_get_dx(rids[i], dx, exams[i], viscode=None, return_code=True) for i in range(len(rids))] exam_dates = np.array(memory.cache(_get_examdatesfmri)(rids)) def _get_viscodesfmri(rids): return [_get_vcodes(rids[i], str(exam_dates[i]), dx) for i in range(len(rids))] viscodes = np.array(memory.cache(_get_viscodesfmri)(rids)) vcodes, vcodes2 = viscodes[:, 0], viscodes[:, 1] return Bunch(func=func_files, dx_group=dx_group, exam_codes=vcodes, exam_dates=exam_dates, exam_codes2=vcodes2, motion=motions, subjects=subjects, images=images)
def construct_and_attach_filename_data(self):
    synsets = self.synset_list
    num_per_synset = self.data['num_per_synset']
    seed = self.data['seed']
    folder = self.local_home('PrecomputedDicts')
    mem = Memory(folder)
    compute_filename_dict = mem.cache(self.compute_filename_dict)
    filenames, filenames_dict = compute_filename_dict(synsets,
                                                      num_per_synset, seed)
    self.filenames_dict = filenames_dict
def add_caching_to_funcs(obj, funcNames):
    mem = Memory('../.add_caching_to_funcs', verbose=11)
    if obj is None or funcNames is None:
        return
    if isScalar(funcNames):
        funcNames = [funcNames]
    for name in funcNames:
        func = getattr(obj, name, None)
        if func is not None:
            setattr(obj, name, mem.cache(func))
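# The helper above patches an object's methods in place with their disk-cached
# versions. A minimal self-contained sketch of the same idea (hypothetical
# `Model` class and cache location, not taken from the original code):
from tempfile import mkdtemp
from joblib import Memory

_sketch_mem = Memory(mkdtemp(), verbose=0)


class Model(object):
    def slow_predict(self, x):
        # stand-in for an expensive computation
        return x ** 2


model = Model()
# replace the bound method with a cached wrapper, as add_caching_to_funcs does
model.slow_predict = _sketch_mem.cache(model.slow_predict)
print(model.slow_predict(3))  # computed once, then read back from disk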
def __init__(self):
    self.name = self.__class__.__name__

    try:
        from joblib import Memory
        mem = Memory(cachedir=self.home('cache'), verbose=False)
        self._get_meta = mem.cache(self._get_meta)
    except ImportError:
        pass
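# This optional-dependency pattern (cache `_get_meta` only when joblib can be
# imported) recurs in several of the constructors below. A standalone sketch of
# the same idea, with a hypothetical loader function and cache path:
def make_cached(func, cache_dir):
    """Return a disk-cached version of `func` if joblib is available,
    otherwise return `func` unchanged."""
    try:
        from joblib import Memory
    except ImportError:
        return func
    return Memory(cache_dir, verbose=0).cache(func)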
def _run_suject_level1_glm(subject_data_dir, subject_output_dir, **kwargs):
    """ Just another wrapper. """
    mem = Memory(os.path.join(subject_output_dir, "cache_dir"))
    return mem.cache(run_suject_level1_glm)(subject_data_dir,
                                            subject_output_dir,
                                            **kwargs)
def __init__(self, meta=None):
    if meta is not None:
        self._meta = meta
    self.name = self.__class__.__name__

    try:
        from joblib import Memory
        mem = Memory(cachedir=self.home('cache'))
        self._get_meta = mem.cache(self._get_meta)
    except ImportError:
        pass
def main():
    ## subsdir = r'E:\elan projects\L2\submissions\extracted'
    ## dstdir = os.path.join(subsdir, r'passed')
    ## copypassedfiles(dstdir, subsdir)
    dstdir = r'E:\elan projects\L2\resubmission\full'

    import glob
    jsonflist = glob.glob(dstdir + '\\' + r'*.379.json')

    mem = Memory(cachedir=dstdir)
    json2agreementmatrix_cached = mem.cache(json2agreementmatrix)

    c = json2agreementmatrix_cached(jsonflist, task_type='all')
    print c
def __init__(self, use_cache=True, cachedir=None):
    """Inits TpsSolverFactory

    Args:
        use_cache: whether to cache solver matrices in file
        cachedir: cache directory. If not specified, the .cache directory
            in the parent directory of the top-level package is used.
    """
    if use_cache:
        if cachedir is None:
            # .cache directory in parent directory of top-level package
            cachedir = os.path.join(
                __import__(__name__.split('.')[0]).__path__[0],
                os.path.pardir, ".cache")
        memory = Memory(cachedir=cachedir, verbose=0)
        self.get_solver_mats = memory.cache(self.get_solver_mats)
def _niigz2nii(self):
    """
    Convert .nii.gz to .nii (crucial for SPM).

    """
    cache_dir = os.path.join(self.output_dir, 'cache_dir')
    mem = Memory(cache_dir, verbose=100)
    self.func = mem.cache(do_niigz2nii)(self.func,
                                        output_dir=self.output_dir)
    if self.anat is not None:
        self.anat = mem.cache(do_niigz2nii)(self.anat,
                                            output_dir=self.output_dir)
def load_adni_longitudinal_hippocampus_volume(): """ Returns longitudinal hippocampus measures """ BASE_DIR = _get_data_base_dir('ADNI_csv') roster = pd.read_csv(os.path.join(BASE_DIR, 'ROSTER.csv')) dx = pd.read_csv(os.path.join(BASE_DIR, 'DXSUM_PDXCONV_ADNIALL.csv')) fs = pd.read_csv(os.path.join(BASE_DIR, 'UCSFFSX51_05_20_15.csv')) # extract hippocampus numerical values column_idx = np.arange(131, 147) cols = ['ST' + str(c) + 'HS' for c in column_idx] hipp = fs[cols].values idx_num = np.array([~np.isnan(h).all() for h in hipp]) hipp = hipp[idx_num, :] # extract roster id rids = fs['RID'].values[idx_num] # caching dataframe extraction functions CACHE_DIR = _get_cache_base_dir() cache_dir = os.path.join(CACHE_DIR, 'joblib', 'load_data_cache') if not os.path.isdir(cache_dir): os.makedirs(cache_dir) memory = Memory(cachedir=cache_dir, verbose=0) # get subject id def _getptidshippo(rids): return [_rid_to_ptid(rid, roster) for rid in rids] ptids = memory.cache(_getptidshippo)(rids) # extract exam date exams = fs['EXAMDATE'].values[idx_num] vcodes = fs['VISCODE'].values[idx_num] vcodes2 = fs['VISCODE2'].values[idx_num] exams = list(map( lambda e: date(int(e[:4]), int(e[5:7]), int(e[8:])), exams)) exams = np.array(exams) # extract diagnosis def _getdxhippo(rids, exams): return np.array(list(map(_get_dx, rids, [dx]*len(rids), exams))) dx_ind = memory.cache(_getdxhippo)(rids, exams) dx_group = DX_LIST[dx_ind] return Bunch(dx_group=np.array(dx_group), subjects=np.array(ptids), hipp=np.array(hipp), exam_dates=np.array(exams), exam_codes=np.array(vcodes), exam_codes2=np.array(vcodes2))
def _load_data(root_dir="/",
               data_set="ds107",
               cache_dir="/volatile/storage/workspace/parietal_retreat/" +
               "covariance_learn/cache/",
               n_jobs=1):
    from joblib import Memory
    mem = Memory(cachedir=cache_dir)
    load_data_ = mem.cache(setup_data_paths.run)

    df = setup_data_paths.get_all_paths(root_dir=root_dir, data_set=data_set)
    # region_signals = joblib.load(os.path.join(root_dir, dump_file))
    region_signals = load_data_(root_dir=root_dir, data_set=data_set,
                                n_jobs=n_jobs,
                                dump_dir=os.path.join(cache_dir, data_set))
    return df, region_signals
def __init__(self, data_same, normalize=True, min_max_scale=False,
             scale_f1=None, scale_f2=None, nframes=1, batch_size=1,
             marginf=0, only_same=False, cache_to_disk=False):
    self.print_mean_DTW_costs(data_same)
    self.ratio_same = 0.5  # init
    self.ratio_same = self.compute_ratio_speakers(data_same)
    self._nframes = nframes
    print "nframes:", self._nframes

    (self._x1, self._x2, self._y_word, self._y_spkr,
     self._scale_f1, self._scale_f2) = self.prep_data(
        data_same, normalize, min_max_scale, scale_f1, scale_f2)

    self._y1 = [numpy.zeros(x.shape[0], dtype='int8') for x in self._x1]
    self._y2 = [numpy.zeros(x.shape[0], dtype='int8') for x in self._x1]
    # self._y1 says if frames in x1 and x2 belong to the same (1) word or not (0)
    # self._y2 says if frames in x1 and x2 were said by the same (1) speaker or not (0)
    for ii, yy in enumerate(self._y_word):
        self._y1[ii][:] = yy
    for ii, yy in enumerate(self._y_spkr):
        self._y2[ii][:] = yy

    self._nwords = batch_size
    self._margin = marginf
    # marginf says if we pad taking a number of frames as margin

    self._x1_mem = []
    self._x2_mem = []
    self._y1_mem = []
    self._y2_mem = []

    self.cache_to_disk = cache_to_disk
    if self.cache_to_disk:
        from joblib import Memory
        self.mem = Memory(cachedir='joblib_cache', verbose=0)
def ica_step(group_maps, group_variance, cachedir=None):
    memory = Memory(cachedir=cachedir, mmap_mode='r')
    # We do a spatial ICA: the arrays are transposed in the following,
    # axis1 = component, and axis2 is voxel number.
    _, ica_maps = memory.cache(fastica)(group_maps.T, whiten=False)

    # Project the ICAs on the group maps to give a 'cross-subject
    # reproducibility' score.
    proj = np.dot(ica_maps, group_maps)
    reproducibility_score = (np.abs(proj) * group_variance).sum(axis=-1)

    order = np.argsort(reproducibility_score)[::-1]
    ica_maps = ica_maps[order, :]

    return ica_maps.T
def __init__(self, caching=False):
    """Create a new CompatIdFetcher object.

    Args:
        caching: Whether to cache setup from run to run. See
            PrebuiltCompatibilityTest.CACHING for details.
    """
    self.compat_ids = None
    if caching:
        # This import occurs here rather than at the top of the file because we
        # don't want to force developers to install joblib. The caching argument
        # is only set to True if PrebuiltCompatibilityTest.CACHING is hand-edited
        # (for testing purposes).
        # pylint: disable=import-error
        from joblib import Memory
        memory = Memory(cachedir=tempfile.gettempdir(), verbose=0)
        self.FetchCompatIds = memory.cache(self.FetchCompatIds)
def load_adni_longitudinal_csf_biomarker(): """ Returns longitudinal csf measures """ BASE_DIR = _get_data_base_dir('ADNI_csv') roster = pd.read_csv(os.path.join(BASE_DIR, 'ROSTER.csv')) dx = pd.read_csv(os.path.join(BASE_DIR, 'DXSUM_PDXCONV_ADNIALL.csv')) csf_files = ['UPENNBIOMK.csv', 'UPENNBIOMK2.csv', 'UPENNBIOMK3.csv', 'UPENNBIOMK4_09_06_12.csv', 'UPENNBIOMK5_10_31_13.csv', 'UPENNBIOMK6_07_02_13.csv', 'UPENNBIOMK7.csv', 'UPENNBIOMK8.csv'] cols = ['RID', 'VISCODE', 'ABETA', 'PTAU', 'TAU'] # 3,4,5,7,8 csf = pd.DataFrame() for csf_file in csf_files[2:]: fs = pd.read_csv(os.path.join(BASE_DIR, csf_file)) csf = csf.append(fs[cols]) # remove nans from csf values biom = csf[cols[2:]].values idx = np.array([~np.isnan(v).any() for v in biom]) biom = biom[idx] # get phenotype vcodes = csf['VISCODE'].values[idx] rids = csf['RID'].values[idx] # caching dataframe extraction functions CACHE_DIR = _get_cache_base_dir() cache_dir = os.path.join(CACHE_DIR, 'joblib', 'load_data_cache') if not os.path.isdir(cache_dir): os.makedirs(cache_dir) memory = Memory(cachedir=cache_dir, verbose=0) def _getptidscsf(rids): return list(map(lambda x: _rid_to_ptid(x, roster), rids)) ptids = memory.cache(_getptidscsf)(rids) # get diagnosis def _getdxcsf(rids, vcodes): return list(map(lambda x, y: DX_LIST[_get_dx(x, dx, viscode=y)], rids, vcodes)) dx_group = memory.cache(_getdxcsf)(rids, vcodes) return Bunch(dx_group=np.array(dx_group), subjects=np.array(ptids), csf=np.array(biom), exam_codes=np.array(vcodes), exam_codes2=np.array(vcodes))
def __init__(self, meta=None, seed=0, ntrain=15, ntest=15, num_splits=10):
    self.seed = seed
    self.ntrain = ntrain
    self.ntest = ntest
    self.num_splits = num_splits

    if meta is not None:
        self._meta = meta

    self.name = self.__class__.__name__

    try:
        from joblib import Memory
        mem = Memory(cachedir=self.home('cache'))
        self._get_meta = mem.cache(self._get_meta)
    except ImportError:
        pass
def __init__(self, *args, **kwargs):
    level = kwargs.pop('level', 10)
    # Initialize the memory object
    self.memory = Memory(*args, **kwargs)
    # The level parameter controls which data we cache;
    # smaller numbers mean less caching
    self.level = level
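# The wrapper above only stores `level`; the code that consults it is not shown.
# One plausible (hypothetical, not from the original class) way to use such a
# threshold is to cache a call only when its declared level does not exceed it:
from joblib import Memory


class LevelledMemory(object):
    """Route a function through the joblib cache only if level <= self.level."""

    def __init__(self, *args, **kwargs):
        self.level = kwargs.pop('level', 10)
        self.memory = Memory(*args, **kwargs)

    def cache(self, func, level=1):
        # smaller self.level => fewer functions qualify => less caching
        return self.memory.cache(func) if level <= self.level else func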
def __init__(self, root, filter_species_ids=None, required_attributes=None, transform=None, is_training=False, cachedir=CACHE_DIR): super(GogglesDataset, self).__init__() mem = Memory(cachedir) metadata_loader = mem.cache(self._load_metadata) self.is_training = is_training self._data_dir = root required_species, \ self.attributes, \ self._image_data = metadata_loader(root) # _load_metadata(root) cached if filter_species_ids is not None: assert type(filter_species_ids) is list filter_species_ids = set(filter_species_ids) required_species = list(filter(lambda s: s.id in filter_species_ids, required_species)) self._image_data = list(filter(lambda d: d.species.id in filter_species_ids, self._image_data)) self._species_labels = {species: label for label, species in enumerate(required_species)} if is_training is not None: self._image_data = list(filter( lambda d: d.is_for_training == is_training, self._image_data)) if required_attributes is not None: assert type(required_attributes) is list self.attributes = required_attributes elif filter_species_ids is not None: attributes = set() for species in required_species: attributes = attributes.union(species.attributes) self.attributes = list(sorted(attributes, key=lambda a: a.id)) self.num_attributes = len(self.attributes) if transform is not None: self._transform = transform else: self._transform = transforms.Compose([transforms.ToTensor()])
def __init__(self, systemConfig, **kwargs): if systemConfig.get('cache', False): try: from tempfile import mkdtemp from joblib import Memory except ImportError: pass else: if 'cacheDir' in systemConfig: cacheDir = systemConfig['cacheDir'] try: os.makedirs(cacheDir) except OSError as e: if e.errno == errno.EEXIST and os.path.isdir(cacheDir): pass else: raise else: cacheDir = mkdtemp() self._mem = Memory(cachedir=cacheDir, verbose=0) # Cache outputs of these methods self.forward = self._mem.cache(self.forward) self.backprop = self._mem.cache(self.backprop) hx = [(systemConfig['dx'], systemConfig['nx']-1)] hz = [(systemConfig['dz'], systemConfig['nz']-1)] self.mesh = SimPEG.Mesh.TensorMesh([hx, hz], '00') self.mesh.ireg = systemConfig.get('ireg', DEFAULT_IREG) self.mesh.freeSurf = systemConfig.get('freeSurf', DEFAULT_FREESURF_BOUNDS) initMap = { # Argument Rename to Property 'c': 'cR', 'Q': None, 'rho': None, 'nPML': None, 'freeSurf': None, 'freq': None, 'ky': None, 'kyweight': None, 'Solver': None, 'dx': None, 'dz': None, 'dtype': None, } for key in initMap.keys(): if key in systemConfig: if initMap[key] is None: setattr(self, key, systemConfig[key]) else: setattr(self, initMap[key], systemConfig[key])
def load_adni_longitudinal_mmse_score(): """ Returns longitudinal mmse scores """ BASE_DIR = _get_data_base_dir('ADNI_csv') roster = pd.read_csv(os.path.join(BASE_DIR, 'ROSTER.csv')) dx = pd.read_csv(os.path.join(BASE_DIR, 'DXSUM_PDXCONV_ADNIALL.csv')) fs = pd.read_csv(os.path.join(BASE_DIR, 'MMSE.csv')) # extract nans free mmse mmse = fs['MMSCORE'].values idx_num = fs['MMSCORE'].notnull().values mmse = mmse[idx_num] # extract roster id rids = fs['RID'].values[idx_num] # caching dataframe extraction functions CACHE_DIR = _get_cache_base_dir() cache_dir = os.path.join(CACHE_DIR, 'joblib', 'load_data_cache') if not os.path.isdir(cache_dir): os.makedirs(cache_dir) memory = Memory(cachedir=cache_dir, verbose=0) def _getptidsmmse(rids): return [_rid_to_ptid(rid, roster) for rid in rids] # get subject id ptids = memory.cache(_getptidsmmse)(rids) # extract visit code (don't use EXAMDATE ; null for GO/2) vcodes = fs['VISCODE'].values vcodes = vcodes[idx_num] vcodes2 = fs['VISCODE2'].values vcodes2 = vcodes2[idx_num] def _getdxmmse(rids, vcodes2): return list(map( lambda x, y: DX_LIST[_get_dx(x, dx, viscode=y)], rids, vcodes2)) # get diagnosis dx_group = memory.cache(_getdxmmse)(rids, vcodes2) return Bunch(dx_group=np.array(dx_group), subjects=np.array(ptids), mmse=mmse, exam_codes=vcodes, exam_codes2=vcodes2)
def compute_confidence_par(allLearners, dada):
    lab_confidence = np.zeros([dada.shape[0], len(allLearners)])
    tic = time.time()
    # import ipdb; ipdb.set_trace()
    print 'producing weighted outputs IN PARALLEL'

    mem = Memory(cachedir='tmp')
    classif_RBF2 = mem.cache(confidence_par)
    c = l_c[0]

    r = Parallel(n_jobs=N_JOBS)(delayed(confidence_par)(allLearners, ii, dada)
                                for ii in enumerate(allLearners))
    res, iis = zip(*r)
    for t, y in enumerate(iis):
        lab_confidence[:, y] = res[t]

    print "time taken to produce confidence:", round(time.time() - tic, 2), "seconds"
    # import ipdb; ipdb.set_trace()
    return lab_confidence
def test_cached(self):
    try:
        from joblib import Memory
        mem = Memory(self.cache_dir)
        dep_tree = {
            'a': 5,
            'b': 6,
            'c': mem.cache(slow_func),
        }
        data = Pipeline(dep_tree)

        t0 = time.time()
        data.resolve()
        delta = time.time() - t0

        t0 = time.time()
        data.resolve()
        delta = time.time() - t0
        assert delta < .1
    except:
        pass
def __init__(self, meta=None, seed=0, ntrain=10, ntest=10, num_splits=5):
    self.seed = seed
    self.ntrain = ntrain
    self.ntest = ntest
    self.num_splits = num_splits
    self.names = ["Face", "Body", "Object"]

    if meta is not None:
        self._meta = meta

    self.name = self.__class__.__name__

    try:
        from joblib import Memory
        mem = Memory(cachedir=self.home("cache"))
        self._get_meta = mem.cache(self._get_meta)
    except ImportError:
        pass
def getagreement(tpl, datadir, task_type='all'):
    """Get agreement values for annotators in the :data:`tpl` list

    Args:
        tpl (list): combination group of annotators
        datadir (str): cache data directory used by joblib

    Returns:
        namedtuple defined as ``Agree = collections.namedtuple('Agree',
        ['kappa', 'alpha', 'avg_ao'], verbose=True)``
    """
    mem = Memory(cachedir=datadir)
    readjson = mem.cache(json2taskdata.readjson, mmap_mode='r')
    create_task_data = mem.cache(json2taskdata.create_task_data)
    count_occurrances = mem.cache(json2taskdata.count_occurrances)
    count_labels = mem.cache(json2taskdata.count_labels)

    annotators = set()
    lectask = []
    # -------------------------------------------------------------------------
    # for each annotator in group tpl
    # -------------------------------------------------------------------------
    for stditem in tpl:
        aname = stditem.split('.')[0][3:][-2:]
        annotators.add(aname)
        lecdict = readjson(stditem)
        newlectask = create_task_data(lecdict, task_type=task_type,
                                      annotator=aname)
        label_data = json2taskdata.create_labels_list(newlectask)
        abscount = count_occurrances(str(label_data))
        yaml.dump(abscount, open(os.path.join(
            datadir, 'abscount-' + aname + '.yaml'), 'w'))

        setcount = count_labels(newlectask)
        yaml.dump(setcount, open(os.path.join(
            datadir, 'setcount-' + aname + '.yaml'), 'w'))

        lectask = lectask + newlectask

    task = AnnotationTask(data=lectask,
                          distance=nltk.metrics.distance.masi_distance_mod)

    return {frozenset(annotators): Agree(task.kappa(), task.alpha(),
                                         task.avg_Ao())}
def __init__(self, *args, **kwargs):
    from tempfile import mkdtemp
    from joblib import Memory

    # NOTE: assumed fix -- the original referenced an undefined `cachedir`;
    # here it is taken from kwargs and falls back to a temporary directory.
    self.cachedir = kwargs.pop('cachedir', None) or mkdtemp()
    self.memory = Memory(cachedir=self.cachedir)
    for method in self.cached_methods:
        setattr(self, method, self.memory.cache(getattr(self, method)))

    if not os.path.isdir(self.cachedir):
        raise OSError("Non-existent directory: ", self.cachedir)

    super(_DiskCache, self).__init__(*args, **kwargs)
def __init__(self, segmentation=None, duration=1., step=0.1, gap=0.,
             threshold=0., feature=None, cache=False):

    super(SpeechTurnSegmentation, self).__init__()

    if segmentation is None:
        self.segmentation = SegmentationGaussianDivergence(
            duration=duration, step=step, gap=gap,
            threshold=threshold
        )
    else:
        self.segmentation = segmentation

    # default features for segmentation
    # are MFCC (energy + 12 coefficients)
    if feature is None:
        from pyannote.feature.yaafe import YaafeMFCC
        feature = YaafeMFCC(
            e=True, De=False, DDe=False,
            coefs=12, D=False, DD=False
        )
    self.feature = feature

    if cache:
        # initialize cache
        from joblib import Memory
        from tempfile import mkdtemp
        memory = Memory(cachedir=mkdtemp(), verbose=0)

        # cache feature extraction method
        self.get_features = memory.cache(self.get_features)
def _set_memory(self, cache_dir):
    # Try importing joblib.
    try:
        from joblib import Memory
        self._memory = Memory(cachedir=self.cache_dir,
                              mmap_mode=None,
                              verbose=self.verbose,
                              )
        logger.debug("Initialize joblib cache dir at `%s`.", self.cache_dir)
    except ImportError:  # pragma: no cover
        logger.warn("Joblib is not installed. "
                    "Install it with `conda install joblib`.")
        self._memory = None
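# When joblib is missing, `self._memory` ends up as None, so callers need a
# fallback. A hedged sketch (hypothetical helper, not part of the original
# class) of how such a nullable memory attribute is typically consumed:
def _cached(self, func):
    """Return a disk-cached version of `func`, or `func` itself when
    self._memory is None because joblib could not be imported."""
    return func if self._memory is None else self._memory.cache(func)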
import os
import glob
import re
import logging

import numpy as np
import pandas as pd
from joblib import Memory
import hdbscan
from sklearn.preprocessing import StandardScaler

memory = Memory(os.path.join(os.path.dirname(__file__), ".cache"))


def load(paths):
    """
    Load the CSV file(s) at the provided path(s) and return a pandas DataFrame.
    """
    if not isinstance(paths, list):
        paths = [paths]
    df = pd.DataFrame()
    for path in paths:
        new_df = pd.read_csv(path, delimiter='|')
        df = pd.concat([df, new_df])
    df = df.reset_index()
    return df


def get_number_columns(df):
import json
from ambiverse_apikey import client_id, client_secret
from ambiverse_token import get_token
from joblib import Memory
import os
import logging
import sys
import time
import datetime
from get_category import query_category

logging.basicConfig(level=logging.INFO, stream=sys.stdout)

cachedir = "./temp"
if not os.path.exists(cachedir):
    os.mkdir(cachedir)
memory = Memory(cachedir=cachedir, verbose=0)


@memory.cache
def ambiverse(item, tool_name):
    text = item["text"]  # .encode('utf-8')
    dpaId = item["dpaId"]

    ambiverse_token = get_token(client_id, client_secret)
    ambiverse_request_url = "https://api.ambiverse.com/v1/entitylinking/analyze"

    text_string = json.dumps({"text": text})
    payload = text_string
    headers = {
        'content-type': "application/json",
        'accept': "application/json",
        'authorization': ambiverse_token
    }
import autograd.numpy as np
from autograd import grad
import math
from matplotlib import pyplot as plt
from joblib import Memory
from generate_trajectory import generate_trajectory, generate_observations

memory = Memory(cachedir='joblib_cache', verbose=0)
# import seaborn as sns

'''
@changing: to change this for a new experiment, modify the following:
    simulator()
    generate_variational()
    data()
    inputs to iterate()
'''

T = 50
Gamma = 0.1
C = 1
Sigma = 0.005
startState = 1e-3


def iterate(params, sim_variables, u1, u2, u3, m, v):
    '''
    @param params: variational distribution parameters
    @param prior_params: prior parameters
    @param sim_variables: simulator variables
    @param u1: for reparametrizing variational distribution
    @param u2: for reparametrizing simulator
    @param u3: for reparametrizing KL divergence
    @param m, v: for Adam
# Finding studies with similar activations
# ========================================

######################################################################
# Transform the coordinates into brain maps
# -----------------------------------------
# Here we generate brain maps for all the studies in the NeuroQuery dataset,
# using the activation coordinates. This takes a long time (around 15 min),
# so we cache the result.

corpus_metadata = encoder.corpus_info["metadata"].set_index("pmid")
coordinates = pd.read_csv(datasets.fetch_peak_coordinates())

# We cache the `coordinates_to_maps` function with joblib to avoid recomputing
# this if we train a new model.
coord_to_maps = Memory(str(cache_directory)).cache(coordinates_to_maps)

# You can set target_affine to a different value to increase image resolution
# or reduce computation time. The model on neuroquery.saclay.inria.fr uses
# 4 mm resolution, i.e. target_affine=(4, 4, 4).
# You can also adjust the smoothing by setting `fwhm` (Full Width at Half
# Maximum).
brain_maps, masker = coord_to_maps(
    coordinates, target_affine=(6, 6, 6), fwhm=9.0)
brain_maps = brain_maps[(brain_maps.values != 0).any(axis=1)]
brain_maps /= np.sum(brain_maps.values, axis=1, keepdims=True)

######################################################################
# Find studies with activations similar to the input maps
# --------------------------------------------------------
import mne
import numpy as np
import matplotlib
import pylab as plt
from glob import glob
from conf_analysis.behavior import metadata
from conf_analysis.meg import preprocessing, localizer, lcmv, srplots
from conf_analysis.meg import source_recon as sr
from joblib import Memory
from functools import reduce

memory = Memory(cachedir=metadata.cachedir)


def make_overview_figures(subjects, bem='three_layer', prefix=''):
    from conf_analysis.meg import srplots
    for sub in subjects:
        avg, idx, F = srplots.single_sub_contrast_indices(sub)
        print('Subject:', sub, 'F:', F)
        gamma_overview(sub, F=F, bem=bem, prefix=prefix + 'F%f' % F)
        stats_overview(sub, F=F, prefix=prefix + 'F%f' % F)


def gamma_overview(subject, F=45, bem='three_layer', prefix=''):
    '''
    Prepare data for an overview figure that shows source recon'ed activity.
    '''
    plt.figure(figsize=(15, 15))
    gs = matplotlib.gridspec.GridSpec(2 * 4, 6)
# coding: utf-8
from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
from xgboost.compat import DASK_INSTALLED
from hypothesis import strategies
from hypothesis.extra.numpy import arrays
from joblib import Memory
from sklearn import datasets
import xgboost as xgb
import numpy as np

try:
    import cupy as cp
except ImportError:
    cp = None

memory = Memory('./cachedir', verbose=0)


def no_sklearn():
    return {
        'condition': not SKLEARN_INSTALLED,
        'reason': 'Scikit-Learn is not installed'
    }


def no_dask():
    return {'condition': not DASK_INSTALLED, 'reason': 'Dask is not installed'}


def no_pandas():
    return {
import os
import numpy as np
import matplotlib.pyplot as plt
from joblib import Memory

import paths
from ..utils.files import listFilesInDir, ensure_dir_exists
from pamap_common import *  # noqa

memory = Memory('./')
join = os.path.join

# ================================================================
# consts

MISSING_DATA_VALUE = np.nan

OPTIONAL_DIR = join(paths.PAMAP2, 'Optional')
PROTOCOL_DIR = join(paths.PAMAP2, 'Protocol')
FIG_SAVE_DIR = join('figs', 'pamap2')
SAVE_DIR_LINE_GRAPH = join(FIG_SAVE_DIR, 'line')
SAVE_DIR_IMG = join(FIG_SAVE_DIR, 'img')

ACTIVITY_IDS_2_NAMES = {
    0: NAME_OTHER,
    1: NAME_LYING,
    2: NAME_SITTING,
    3: NAME_STANDING,
    4: NAME_WALK,
    5: NAME_RUN,
    6: NAME_CYCLE,
def test_pipeline_memory_sampler(): X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=0, ) cachedir = mkdtemp() try: memory = Memory(cachedir, verbose=10) # Test with Transformer + SVC clf = SVC(gamma="scale", probability=True, random_state=0) transf = DummySampler() pipe = Pipeline([("transf", clone(transf)), ("svc", clf)]) cached_pipe = Pipeline([("transf", transf), ("svc", clf)], memory=memory) # Memoize the transformer at the first fit cached_pipe.fit(X, y) pipe.fit(X, y) # Get the time stamp of the tranformer in the cached pipeline expected_ts = cached_pipe.named_steps["transf"].timestamp_ # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_, ) assert not hasattr(transf, "means_") # Check that we are reading the cache while fitting # a second time cached_pipe.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_, ) assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts # Create a new pipeline with cloned estimators # Check that even changing the name step does not affect the cache hit clf_2 = SVC(gamma="scale", probability=True, random_state=0) transf_2 = DummySampler() cached_pipe_2 = Pipeline([("transf_2", transf_2), ("svc", clf_2)], memory=memory) cached_pipe_2.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe_2.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe_2.named_steps["transf_2"].means_, ) assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts finally: shutil.rmtree(cachedir)
def test_pipeline_memory_transformer(): iris = load_iris() X = iris.data y = iris.target cachedir = mkdtemp() try: memory = Memory(cachedir, verbose=10) # Test with Transformer + SVC clf = SVC(gamma="scale", probability=True, random_state=0) transf = DummyTransf() pipe = Pipeline([("transf", clone(transf)), ("svc", clf)]) cached_pipe = Pipeline([("transf", transf), ("svc", clf)], memory=memory) # Memoize the transformer at the first fit cached_pipe.fit(X, y) pipe.fit(X, y) # Get the time stamp of the tranformer in the cached pipeline expected_ts = cached_pipe.named_steps["transf"].timestamp_ # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_, ) assert not hasattr(transf, "means_") # Check that we are reading the cache while fitting # a second time cached_pipe.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_, ) assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts # Create a new pipeline with cloned estimators # Check that even changing the name step does not affect the cache hit clf_2 = SVC(gamma="scale", probability=True, random_state=0) transf_2 = DummyTransf() cached_pipe_2 = Pipeline([("transf_2", transf_2), ("svc", clf_2)], memory=memory) cached_pipe_2.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe_2.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y)) assert_array_equal( pipe.named_steps["transf"].means_, cached_pipe_2.named_steps["transf_2"].means_, ) assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts finally: shutil.rmtree(cachedir)
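# Both tests above exercise the same behaviour: passing a joblib `Memory` (or a
# cache directory) to a scikit-learn `Pipeline` memoizes the transformer fit on
# the first call. A minimal sketch of that usage outside the test harness
# (standard scikit-learn API; cache directory chosen here for illustration):
from shutil import rmtree
from tempfile import mkdtemp

from joblib import Memory
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
cachedir = mkdtemp()
pipe = Pipeline([("scale", StandardScaler()), ("svc", SVC(gamma="scale"))],
                memory=Memory(cachedir, verbose=0))
pipe.fit(X, y)    # the scaler fit is computed and written to the cache
pipe.fit(X, y)    # the scaler fit is loaded from the cache
rmtree(cachedir)  # clean up, as the tests above do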
from nilearn import datasets
msdl_atlas_dataset = datasets.fetch_atlas_msdl()
rest_dataset = datasets.fetch_development_fmri(n_subjects=n_subjects)

# print basic information on the dataset
print('First subject functional nifti image (4D) is at: %s' %
      rest_dataset.func[0])  # 4D data

##############################################################################
# Extracting region signals
# --------------------------
from nilearn import input_data

# A "memory" to avoid recomputation
from joblib import Memory
mem = Memory('nilearn_cache')

masker = input_data.NiftiMapsMasker(
    msdl_atlas_dataset.maps, resampling_target="maps", detrend=True,
    high_variance_confounds=True, low_pass=None, high_pass=0.01, t_r=2,
    standardize=True, memory='nilearn_cache', memory_level=1, verbose=2)
masker.fit()

subject_time_series = []
import logging
import pprint
import os
from tempfile import gettempdir
from tabulate import tabulate
from copy import copy, deepcopy
from urllib.parse import urlparse, parse_qs
from nidm.experiment import Navigate
from nidm.experiment.Utils import validate_uuid
from numpy import std, mean, median
import functools
import operator
from joblib import Memory

memory = Memory(gettempdir(), verbose=0)
USE_JOBLIB_CACHE = False

import simplejson


def convertListtoDict(lst):
    '''
    This function converts a list to a dictionary

    :param lst: list to convert
    :return: dictionary
    '''
    res_dct = {lst[i]: lst[i + 1] for i in range(0, len(lst), 2)}
    return res_dct
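# The module above defines both `memory` and a `USE_JOBLIB_CACHE` flag, but the
# code that consults the flag is not shown here. A plausible (hypothetical)
# pattern is to pick the cached or uncached callable based on that flag:
def maybe_cached(func):
    """Route `func` through the joblib cache only when USE_JOBLIB_CACHE is True."""
    return memory.cache(func) if USE_JOBLIB_CACHE else func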
def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0, cluster_selection_epsilon=0.0, metric='minkowski', p=2, leaf_size=40, algorithm='best', memory=Memory(cachedir=None, verbose=0), approx_min_span_tree=True, gen_min_span_tree=False, core_dist_n_jobs=4, cluster_selection_method='eom', allow_single_cluster=False, match_reference_implementation=False, **kwargs): """Perform HDBSCAN clustering from a vector array or distance matrix. Parameters ---------- X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ array of shape (n_samples, n_samples) A feature array, or array of distances between samples if ``metric='precomputed'``. min_cluster_size : int, optional (default=5) The minimum number of samples in a group for that group to be considered a cluster; groupings smaller than this size will be left as noise. min_samples : int, optional (default=None) The number of samples in a neighborhood for a point to be considered as a core point. This includes the point itself. defaults to the min_cluster_size. cluster_selection_epsilon: float, optional (default=0.0) A distance threshold. Clusters below this value will be merged. See [3]_ for more information. alpha : float, optional (default=1.0) A distance scaling parameter as used in robust single linkage. See [2]_ for more information. metric : string or callable, optional (default='minkowski') The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by metrics.pairwise.pairwise_distances for its metric parameter. If metric is "precomputed", X is assumed to be a distance matrix and must be square. p : int, optional (default=2) p value to use if using the minkowski metric. leaf_size : int, optional (default=40) Leaf size for trees responsible for fast nearest neighbour queries. algorithm : string, optional (default='best') Exactly which algorithm to use; hdbscan has variants specialised for different characteristics of the data. By default this is set to ``best`` which chooses the "best" algorithm given the nature of the data. You can force other options if you believe you know better. Options are: * ``best`` * ``generic`` * ``prims_kdtree`` * ``prims_balltree`` * ``boruvka_kdtree`` * ``boruvka_balltree`` memory : instance of joblib.Memory or string, optional Used to cache the output of the computation of the tree. By default, no caching is done. If a string is given, it is the path to the caching directory. approx_min_span_tree : bool, optional (default=True) Whether to accept an only approximate minimum spanning tree. For some algorithms this can provide a significant speedup, but the resulting clustering may be of marginally lower quality. If you are willing to sacrifice speed for correctness you may want to explore this; in general this should be left at the default True. gen_min_span_tree : bool, optional (default=False) Whether to generate the minimum spanning tree for later analysis. core_dist_n_jobs : int, optional (default=4) Number of parallel jobs to run in core distance computations (if supported by the specific algorithm). For ``core_dist_n_jobs`` below -1, (n_cpus + 1 + core_dist_n_jobs) are used. cluster_selection_method : string, optional (default='eom') The method used to select clusters from the condensed tree. The standard approach for HDBSCAN* is to use an Excess of Mass algorithm to find the most persistent clusters. 
Alternatively you can instead select the clusters at the leaves of the tree -- this provides the most fine grained and homogeneous clusters. Options are: * ``eom`` * ``leaf`` allow_single_cluster : bool, optional (default=False) By default HDBSCAN* will not produce a single cluster, setting this to t=True will override this and allow single cluster results in the case that you feel this is a valid result for your dataset. (default False) match_reference_implementation : bool, optional (default=False) There exist some interpretational differences between this HDBSCAN* implementation and the original authors reference implementation in Java. This can result in very minor differences in clustering results. Setting this flag to True will, at a some performance cost, ensure that the clustering results match the reference implementation. **kwargs : optional Arguments passed to the distance metric Returns ------- labels : ndarray, shape (n_samples, ) Cluster labels for each point. Noisy samples are given the label -1. probabilities : ndarray, shape (n_samples, ) Cluster membership strengths for each point. Noisy samples are assigned 0. cluster_persistence : array, shape (n_clusters, ) A score of how persistent each cluster is. A score of 1.0 represents a perfectly stable cluster that persists over all distance scales, while a score of 0.0 represents a perfectly ephemeral cluster. These scores can be guage the relative coherence of the clusters output by the algorithm. condensed_tree : record array The condensed cluster hierarchy used to generate clusters. single_linkage_tree : ndarray, shape (n_samples - 1, 4) The single linkage tree produced during clustering in scipy hierarchical clustering format (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html). min_spanning_tree : ndarray, shape (n_samples - 1, 3) The minimum spanning as an edgelist. If gen_min_span_tree was False this will be None. References ---------- .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April). Density-based clustering based on hierarchical density estimates. In Pacific-Asia Conference on Knowledge Discovery and Data Mining (pp. 160-172). Springer Berlin Heidelberg. .. [2] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the cluster tree. In Advances in Neural Information Processing Systems (pp. 343-351). .. [3] Malzer, C., & Baum, M. (2019). A Hybrid Approach To Hierarchical Density-based Cluster Selection. arxiv preprint 1911.02282. 
""" if min_samples is None: min_samples = min_cluster_size if type(min_samples) is not int or type(min_cluster_size) is not int: raise ValueError('Min samples and min cluster size must be integers!') if min_samples <= 0 or min_cluster_size <= 0: raise ValueError('Min samples and Min cluster size must be positive' ' integers') # if min_cluster_size == 1: # raise ValueError('Min cluster size must be greater than one') if type(cluster_selection_epsilon) is int: cluster_selection_epsilon = float(cluster_selection_epsilon) if type(cluster_selection_epsilon ) is not float or cluster_selection_epsilon < 0.0: raise ValueError( 'Epsilon must be a float value greater than or equal to 0!') if not isinstance(alpha, float) or alpha <= 0.0: raise ValueError('Alpha must be a positive float value greater than' ' 0!') if leaf_size < 1: raise ValueError('Leaf size must be greater than 0!') if metric == 'minkowski': if p is None: raise TypeError('Minkowski metric given but no p value supplied!') if p < 0: raise ValueError('Minkowski metric with negative p value is not' ' defined!') if match_reference_implementation: min_samples = min_samples - 1 min_cluster_size = min_cluster_size + 1 approx_min_span_tree = False if cluster_selection_method not in ('eom', 'leaf'): raise ValueError('Invalid Cluster Selection Method: %s\n' 'Should be one of: "eom", "leaf"\n') # Checks input and converts to an nd-array where possible if metric != 'precomputed' or issparse(X): X = check_array(X, accept_sparse='csr') else: # Only non-sparse, precomputed distance matrices are handled here # and thereby allowed to contain numpy.inf for missing distances check_precomputed_distance_matrix(X) # Python 2 and 3 compliant string_type checking if isinstance(memory, six.string_types): memory = Memory(cachedir=memory, verbose=0) size = X.shape[0] min_samples = min(size - 1, min_samples) if min_samples == 0: min_samples = 1 if algorithm != 'best': if metric != 'precomputed' and issparse(X) and metric != 'generic': raise ValueError( "Sparse data matrices only support algorithm 'generic'.") if algorithm == 'generic': (single_linkage_tree, result_min_span_tree) = memory.cache(_hdbscan_generic)( X, min_samples, alpha, metric, p, leaf_size, gen_min_span_tree, **kwargs) elif algorithm == 'prims_kdtree': if metric not in KDTree.valid_metrics: raise ValueError("Cannot use Prim's with KDTree for this" " metric!") (single_linkage_tree, result_min_span_tree) = memory.cache(_hdbscan_prims_kdtree)( X, min_samples, alpha, metric, p, leaf_size, gen_min_span_tree, **kwargs) elif algorithm == 'prims_balltree': if metric not in BallTree.valid_metrics: raise ValueError("Cannot use Prim's with BallTree for this" " metric!") (single_linkage_tree, result_min_span_tree) = memory.cache(_hdbscan_prims_balltree)( X, min_samples, alpha, metric, p, leaf_size, gen_min_span_tree, **kwargs) elif algorithm == 'boruvka_kdtree': if metric not in BallTree.valid_metrics: raise ValueError("Cannot use Boruvka with KDTree for this" " metric!") (single_linkage_tree, result_min_span_tree) = memory.cache(_hdbscan_boruvka_kdtree)( X, min_samples, alpha, metric, p, leaf_size, approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, **kwargs) elif algorithm == 'boruvka_balltree': if metric not in BallTree.valid_metrics: raise ValueError("Cannot use Boruvka with BallTree for this" " metric!") (single_linkage_tree, result_min_span_tree) = memory.cache(_hdbscan_boruvka_balltree)( X, min_samples, alpha, metric, p, leaf_size, approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, 
**kwargs) else: raise TypeError('Unknown algorithm type %s specified' % algorithm) else: if issparse(X) or metric not in FAST_METRICS: # We can't do much with sparse matrices ... (single_linkage_tree, result_min_span_tree) = memory.cache(_hdbscan_generic)( X, min_samples, alpha, metric, p, leaf_size, gen_min_span_tree, **kwargs) elif metric in KDTree.valid_metrics: # TO DO: Need heuristic to decide when to go to boruvka; # still debugging for now if X.shape[1] > 60: (single_linkage_tree, result_min_span_tree) = memory.cache(_hdbscan_prims_kdtree)( X, min_samples, alpha, metric, p, leaf_size, gen_min_span_tree, **kwargs) else: (single_linkage_tree, result_min_span_tree) = memory.cache(_hdbscan_boruvka_kdtree)( X, min_samples, alpha, metric, p, leaf_size, approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, **kwargs) else: # Metric is a valid BallTree metric # TO DO: Need heuristic to decide when to go to boruvka; # still debugging for now if X.shape[1] > 60: (single_linkage_tree, result_min_span_tree) = memory.cache(_hdbscan_prims_balltree)( X, min_samples, alpha, metric, p, leaf_size, gen_min_span_tree, **kwargs) else: (single_linkage_tree, result_min_span_tree ) = memory.cache(_hdbscan_boruvka_balltree)( X, min_samples, alpha, metric, p, leaf_size, approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, **kwargs) return _tree_to_labels(X, single_linkage_tree, min_cluster_size, cluster_selection_method, allow_single_cluster, match_reference_implementation, cluster_selection_epsilon) + \ (result_min_span_tree,)
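# Given the signature and docstring above, a minimal call sketch (synthetic
# data; per the docstring, passing a string as `memory` enables on-disk caching
# of the tree computation; return values unpacked as documented):
import numpy as np

points = np.random.RandomState(0).normal(size=(100, 2))
(labels, probabilities, persistence,
 condensed_tree, single_linkage_tree, mst) = hdbscan(
    points, min_cluster_size=10, memory='./hdbscan_cache',
    gen_min_span_tree=True)
print(np.unique(labels))  # cluster ids, with -1 marking noise points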
import matplotlib.pyplot as plt
import numpy as np
from joblib import Memory
import pandas

from sklearn.utils.testing import ignore_warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition.nmf import NMF
from sklearn.decomposition.nmf import _initialize_nmf
from sklearn.decomposition.nmf import _beta_divergence
from sklearn.decomposition.nmf import _check_init
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils.extmath import safe_sparse_dot, squared_norm
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted, check_non_negative

mem = Memory(cachedir='.', verbose=0)

###################
# Start of _PGNMF #
###################
# This class implements a projected gradient solver for the NMF.
# The projected gradient solver was removed from scikit-learn in version 0.19,
# and a simplified copy is used here for comparison purpose only.
# It is not tested, and it may change or disappear without notice.


def _norm(x):
    """Dot product-based Euclidean norm implementation

    See: http://fseoane.net/blog/2011/computing-the-vector-norm/
    """
    return np.sqrt(squared_norm(x))
def pipeline(name_threshold: int, email_threshold: int, data_loc: str, lower_names: bool = True, lower_emails: bool = True, use_committer: bool = False, cache_loc: str = None, use_precalculated_popular: bool = True, debug_output=None): if cache_loc: memory = Memory(cache_loc, verbose=0) read_names_emails_gids_cache = memory.cache(read_names_emails_gids) else: read_names_emails_gids_cache = read_names_emails_gids names, emails, github_ids, repositories = read_names_emails_gids_cache( data_loc=data_loc, use_committer=use_committer) print("Input is ready! Number of samples", len(names)) # prepare preprocessing function for names and emails preproces_emails = get_preprocessing(lower_emails) preproces_names = get_preprocessing(lower_names) names = list(map(preproces_names, names)) emails = list(map(preproces_emails, emails)) if use_precalculated_popular: from idmatching.blacklist import POPULAR_NAMES, POPULAR_EMAILS popular_names = POPULAR_NAMES popular_emails = POPULAR_EMAILS else: # collect popular names and emails popular_names = CooccurrenceFiltering( threshold=name_threshold, threshold_comp=">=", is_ignored_key=is_ignored_name, is_ignored_value=is_ignored_email).fit(names, emails).popular_keys popular_emails = CooccurrenceFiltering( threshold=email_threshold, threshold_comp=">=", is_ignored_key=is_ignored_email, is_ignored_value=is_ignored_name).fit(emails, names).popular_keys print("Number of popular names to ignore", len(popular_names)) print("Number of popular emails to ignore", len(popular_emails)) # prepare filtering functions is_ignored_popular_name = prepare_is_blacklisted_function( black_list=popular_names, preprocess_value=preproces_names) is_ignored_popular_email = prepare_is_blacklisted_function( black_list=popular_emails, preprocess_value=preproces_emails) # replace popular names with (name, repository) pair for i, (name, repository) in enumerate(zip(names, repositories)): if is_ignored_popular_name(name): names[i] = "(%s, %s)" % (name, repository) popular_names = CooccurrenceFiltering( threshold=name_threshold, threshold_comp=">=", is_ignored_key=is_ignored_name, is_ignored_value=is_ignored_email).fit(names, emails).popular_keys # print("Number of popular names to ignore after replacement", len(popular_names)) # is_ignored_popular_name = prepare_is_blacklisted_function(black_list=popular_names, # preprocess_value=preproces_names) def is_ignored_popular_name(*args): return False # identity matching raw_persons = [] for name, email in tqdm(zip(names, emails), total=len(names)): raw_persons.append( RawPerson(name=preproces_names(name), email=preproces_emails(email))) identity2person = identity_matching_pipeline( raw_persons=raw_persons, is_ignored_name=is_ignored_name, is_ignored_email=is_ignored_email, is_popular_name=is_ignored_popular_name, is_popular_email=is_ignored_popular_email) # save result identity2person_to_save = sorted( "%s||%s\n" % ("|".join(sorted(person.names)), "|".join(sorted(person.emails))) for person in identity2person.values()) if debug_output: with open(debug_output, "w") as f: f.writelines(identity2person_to_save) # evaluation # predicted name_emails2id = {} for k, v in tqdm(identity2person.items(), total=len(identity2person)): for em in v.emails: name_emails2id[em] = k for n in v.names: name_emails2id[n] = k # ground truth email_name2gid = {} gid2email_name = defaultdict(set) for name, email, gid in tqdm(zip(names, emails, github_ids), total=len(names)): name, email = preproces_names(name), preproces_emails(email) if not is_ignored_name(name) and not 
is_ignored_email(email): email_name2gid[name] = gid email_name2gid[email] = gid gid2email_name[gid].add(name) gid2email_name[gid].add(email) # measure quality per sample prec = [] rec = [] f1 = [] cc_size = [] for person_names_emails in tqdm(gid2email_name.values(), total=len(gid2email_name)): pred_id = set() for ent in person_names_emails: pred_id.add(name_emails2id[ent]) for pid in pred_id: intersection = 0 for ent in person_names_emails: if ent in identity2person[ pid].emails or ent in identity2person[pid].names: intersection += 1 rec.append(intersection / len(person_names_emails)) prec.append(intersection / (len(identity2person[pid].emails) + len(identity2person[pid].names))) if prec[-1] == 0 and rec[-1] == 0: f1.append(0) else: f1.append(2 * prec[-1] * rec[-1] / (prec[-1] + rec[-1])) cc_size.append( len(identity2person[pid].emails) + len(identity2person[pid].names)) def avr(x): return sum(x) / len(x) avr_prec, avr_rec, avr_f1 = avr(prec), avr(rec), avr(f1) print("Precision %s, recall %s, f1 %s" % (avr_prec, avr_rec, avr_f1)) def wavr(x, w): return sum(x_ * w_ for x_, w_ in zip(x, w)) / sum(w) wavr_prec, wavr_rec, wavr_f1 = wavr(prec, cc_size), wavr(rec, cc_size), wavr( f1, cc_size) print("Precision %s, recall %s, f1 %s" % (wavr_prec, wavr_rec, wavr_f1)) return avr_prec, avr_rec, avr_f1, wavr_prec, wavr_rec, wavr_f1, identity2person, \ gid2email_name, raw_persons
import os

from joblib import Memory

from lint_analysis.bin_counts.models import BinCount

cache_dir = os.path.join(os.path.dirname(__file__), 'cache')
memory = Memory(cache_dir)

token_counts = memory.cache(BinCount.token_counts)
token_pos_counts = memory.cache(BinCount.token_pos_counts)
pos_series = memory.cache(BinCount.pos_series)
token_series = memory.cache(BinCount.token_series)
# DBSCAN - density based algorithm
# Performance: near optimal
f7 = plt.figure(7)
plot_clusters(data, cluster.DBSCAN, (), {'eps': 0.25})

# HDBSCAN - density based algorithm that allows for varying density
# Performance: optimal
# Add desaturation for points with lower probability of belonging
# to a cluster.
f8 = plt.figure(8)
clusterer = hdbscan.HDBSCAN(algorithm='best', alpha=1.0,
                            approx_min_span_tree=True,
                            gen_min_span_tree=True, leaf_size=40,
                            memory=Memory(cachedir=None),
                            metric='euclidean', min_cluster_size=6,
                            min_samples=None, p=None,
                            cluster_selection_method='eom')
clusterer = clusterer.fit(data)
start_time = time.time()
end_time = time.time()
palette = sns.color_palette('deep')
cluster_colors = [
    sns.desaturate(palette[col], sat) if col >= 0 else (0.5, 0.5, 0.5)
    for col, sat in zip(clusterer.labels_, clusterer.probabilities_)
]
plt.scatter(data.T[0], data.T[1], c=cluster_colors, **plot_kwds)
'''
Created on March 28, 2018

@author: Alejandro Molina
'''
import numpy as np
from joblib import Memory

from spn.algorithms.Inference import likelihood, histogram_likelihood
from spn.algorithms.StructureLearning import learn_structure
from spn.algorithms.splitting.Clustering import get_split_rows_KMeans
from spn.algorithms.splitting.RDC import get_split_cols_RDC
from spn.gpu.TensorFlow import eval_tf
from spn.structure.Base import Context
from spn.structure.leaves.Histograms import create_histogram_leaf, add_domains

memory = Memory(cachedir="cache", verbose=0, compress=9)


@memory.cache
def learn(data, ds_context):
    spn = learn_structure(data, ds_context, get_split_rows_KMeans(),
                          get_split_cols_RDC(), create_histogram_leaf)
    return spn


if __name__ == '__main__':
    data = np.loadtxt("test_data.txt", delimiter=";", dtype=np.int32)

    ds_context = Context(meta_types=["discrete"] * data.shape[1])
    add_domains(data, ds_context)
def __init__(self):
    self.cachedir = os.path.dirname(os.path.realpath(__file__))
    self.memory = Memory(cachedir=self.cachedir, verbose=0)
    self.client = slack.WebClient(token=SLACK_API_BOT_TOKEN)
def run(dataset, word2vec, epoch, frequency, gpu, out, model, batchsize, lr, fix_embedding, resume): """ Train multi-domain user review classification using Blitzer et al.'s dataset (https://www.cs.jhu.edu/~mdredze/datasets/sentiment/) Please refer README.md for details. """ memory = Memory(cachedir=out, verbose=1) w2v, vocab, train_dataset, dev_dataset, _, label_dict, domain_dict = \ memory.cache(prepare_blitzer_data)(dataset, word2vec) if model == 'rnn': model = multidomain_sentiment.models.create_rnn_predictor( len(domain_dict), w2v.shape[0], w2v.shape[1], 300, len(label_dict), 2, 300, dropout_rnn=0.1, initialEmb=w2v, dropout_emb=0.1, fix_embedding=fix_embedding) elif model == 'cnn': model = multidomain_sentiment.models.create_cnn_predictor( len(domain_dict), w2v.shape[0], w2v.shape[1], 300, len(label_dict), 300, dropout_fc=0.1, initialEmb=w2v, dropout_emb=0.1, fix_embedding=fix_embedding) else: assert not "should not get here" classifier = multidomain_sentiment.models.MultiDomainClassifier( model, domain_dict=domain_dict) if gpu >= 0: # Make a specified GPU current chainer.cuda.get_device_from_id(gpu).use() classifier.to_gpu() # Copy the model to the GPU # Setup an optimizer optimizer = chainer.optimizers.Adam(alpha=lr) optimizer.setup(classifier) train_iter = chainer.iterators.SerialIterator(train_dataset, batchsize) # Set up a trainer updater = training.StandardUpdater( train_iter, optimizer, device=gpu, converter=multidomain_sentiment.training.convert) if dev_dataset is not None: stop_trigger = EarlyStoppingTrigger(monitor='validation/main/loss', max_trigger=(epoch, 'epoch')) trainer = training.Trainer(updater, stop_trigger, out=out) logger.info("train: {}, dev: {}".format(len(train_dataset), len(dev_dataset))) # Evaluate the model with the development dataset for each epoch dev_iter = chainer.iterators.SerialIterator(dev_dataset, batchsize, repeat=False, shuffle=False) evaluator = extensions.Evaluator( dev_iter, classifier, device=gpu, converter=multidomain_sentiment.training.convert) trainer.extend(evaluator, trigger=frequency) # This works together with EarlyStoppingTrigger to provide more reliable # early stopping trainer.extend(SaveRestore(), trigger=chainer.training.triggers.MinValueTrigger( 'validation/main/loss')) else: trainer = training.Trainer(updater, (epoch, 'epoch'), out=out) logger.info("train: {}".format(len(train_dataset))) # SaveRestore will save the snapshot when dev_dataset is available trainer.extend(extensions.snapshot(), trigger=frequency) logger.info("With labels: %s" % json.dumps(label_dict)) # Take a snapshot for each specified epoch if gpu < 0: # ParameterStatistics does not work with GPU as of chainer 2.x # https://github.com/chainer/chainer/issues/3027 trainer.extend(extensions.ParameterStatistics(model, trigger=(100, 'iteration')), priority=99) # Write a log of evaluation statistics for each iteration trainer.extend(extensions.LogReport(trigger=(1, 'iteration')), priority=98) trainer.extend(extensions.PrintReport([ 'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy', 'validation/main/accuracy' ]), trigger=frequency, priority=97) if resume: # Resume from a snapshot chainer.serializers.load_npz(resume, trainer) logger.info("Started training") trainer.run() # Save final model (without trainer) chainer.serializers.save_npz(os.path.join(out, 'trained_model'), model) with open(os.path.join(out, 'vocab.json'), 'w') as fout: json.dump(vocab, fout)
def fetch_lfw_pairs(subset='train', data_home=None, funneled=True, resize=0.5,
                    color=False, slice_=(slice(70, 195), slice(78, 172)),
                    download_if_missing=True):
    """Load the Labeled Faces in the Wild (LFW) pairs dataset (classification).

    Download it if necessary.

    =================   =======================
    Classes                                5749
    Samples total                         13233
    Dimensionality                         5828
    Features            real, between 0 and 255
    =================   =======================

    In the official `README.txt`_ this task is described as the
    "Restricted" task.  Since it is unclear how to implement the
    "Unrestricted" variant correctly, it is left unsupported for now.

    .. _`README.txt`: http://vis-www.cs.umass.edu/lfw/README.txt

    The original images are 250 x 250 pixels, but the default slice and
    resize arguments reduce them to 62 x 47.

    Read more in the :ref:`User Guide <labeled_faces_in_the_wild_dataset>`.

    Parameters
    ----------
    subset : optional, default: 'train'
        Select the dataset to load: 'train' for the development training
        set, 'test' for the development test set, and '10_folds' for the
        official evaluation set that is meant to be used with a 10-fold
        cross-validation.

    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By
        default all scikit-learn data is stored in '~/scikit_learn_data'
        subfolders.

    funneled : boolean, optional, default: True
        Download and use the funneled variant of the dataset.

    resize : float, optional, default 0.5
        Ratio used to resize each face picture.

    color : boolean, optional, default False
        Keep the 3 RGB channels instead of averaging them to a single
        gray level channel. If color is True the shape of the data has
        one more dimension than the shape with color = False.

    slice_ : optional
        Provide a custom 2D slice (height, width) to extract the
        'interesting' part of the jpeg files and avoid using statistical
        correlation from the background.

    download_if_missing : optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    The data is returned as a Bunch object with the following attributes:

    data : numpy array of shape (2200, 5828). Shape depends on ``subset``.
        Each row corresponds to 2 ravel'd face images of original size
        62 x 47 pixels. Changing the ``slice_``, ``resize`` or ``subset``
        parameters will change the shape of the output.

    pairs : numpy array of shape (2200, 2, 62, 47). Shape depends on
        ``subset``. Each row has 2 face images corresponding to same or
        different person from the dataset containing 5749 people.
        Changing the ``slice_``, ``resize`` or ``subset`` parameters will
        change the shape of the output.

    target : numpy array of shape (2200,). Shape depends on ``subset``.
        Labels associated to each pair of images. The two label values
        being different persons or the same person.

    DESCR : string
        Description of the Labeled Faces in the Wild (LFW) dataset.
""" lfw_home, data_folder_path = _check_fetch_lfw( data_home=data_home, funneled=funneled, download_if_missing=download_if_missing) logger.debug('Loading %s LFW pairs from %s', subset, lfw_home) # wrap the loader in a memoizing function that will return memmaped data # arrays for optimal memory usage if LooseVersion(joblib.__version__) < LooseVersion('0.12'): # Deal with change of API in joblib m = Memory(cachedir=lfw_home, compress=6, verbose=0) else: m = Memory(location=lfw_home, compress=6, verbose=0) load_func = m.cache(_fetch_lfw_pairs) # select the right metadata file according to the requested subset label_filenames = { 'train': 'pairsDevTrain.txt', 'test': 'pairsDevTest.txt', '10_folds': 'pairs.txt', } if subset not in label_filenames: raise ValueError("subset='%s' is invalid: should be one of %r" % (subset, list(sorted(label_filenames.keys())))) index_file_path = join(lfw_home, label_filenames[subset]) # load and memoize the pairs as np arrays pairs, target, target_names = load_func(index_file_path, data_folder_path, resize=resize, color=color, slice_=slice_) module_path = dirname(__file__) with open(join(module_path, 'descr', 'lfw.rst')) as rst_file: fdescr = rst_file.read() # pack the results as a Bunch instance return Bunch(data=pairs.reshape(len(pairs), -1), pairs=pairs, target=target, target_names=target_names, DESCR=fdescr)
def fetch_lfw_people(data_home=None, funneled=True, resize=0.5,
                     min_faces_per_person=0, color=False,
                     slice_=(slice(70, 195), slice(78, 172)),
                     download_if_missing=True, return_X_y=False):
    """Load the Labeled Faces in the Wild (LFW) people dataset \
(classification).

    Download it if necessary.

    =================   =======================
    Classes                                5749
    Samples total                         13233
    Dimensionality                         5828
    Features            real, between 0 and 255
    =================   =======================

    Read more in the :ref:`User Guide <labeled_faces_in_the_wild_dataset>`.

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By
        default all scikit-learn data is stored in '~/scikit_learn_data'
        subfolders.

    funneled : boolean, optional, default: True
        Download and use the funneled variant of the dataset.

    resize : float, optional, default 0.5
        Ratio used to resize each face picture.

    min_faces_per_person : int, optional, default 0
        The extracted dataset will only retain pictures of people that
        have at least `min_faces_per_person` different pictures.

    color : boolean, optional, default False
        Keep the 3 RGB channels instead of averaging them to a single
        gray level channel. If color is True the shape of the data has
        one more dimension than the shape with color = False.

    slice_ : optional
        Provide a custom 2D slice (height, width) to extract the
        'interesting' part of the jpeg files and avoid using statistical
        correlation from the background.

    download_if_missing : optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    return_X_y : boolean, default=False
        If True, returns ``(dataset.data, dataset.target)`` instead of a
        Bunch object. See below for more information about the
        `dataset.data` and `dataset.target` object.

        .. versionadded:: 0.20

    Returns
    -------
    dataset : dict-like object with the following attributes:

    dataset.data : numpy array of shape (13233, 2914)
        Each row corresponds to a ravelled face image of original size
        62 x 47 pixels. Changing the ``slice_`` or resize parameters will
        change the shape of the output.

    dataset.images : numpy array of shape (13233, 62, 47)
        Each row is a face image corresponding to one of the 5749 people
        in the dataset. Changing the ``slice_`` or resize parameters will
        change the shape of the output.

    dataset.target : numpy array of shape (13233,)
        Labels associated to each face image. Those labels range from
        0-5748 and correspond to the person IDs.

    dataset.DESCR : string
        Description of the Labeled Faces in the Wild (LFW) dataset.

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.20
    """
    lfw_home, data_folder_path = _check_fetch_lfw(
        data_home=data_home, funneled=funneled,
        download_if_missing=download_if_missing)
    logger.debug('Loading LFW people faces from %s', lfw_home)

    # wrap the loader in a memoizing function that will return memmapped
    # data arrays for optimal memory usage
    if LooseVersion(joblib.__version__) < LooseVersion('0.12'):
        # Deal with change of API in joblib
        m = Memory(cachedir=lfw_home, compress=6, verbose=0)
    else:
        m = Memory(location=lfw_home, compress=6, verbose=0)
    load_func = m.cache(_fetch_lfw_people)

    # load and memoize the faces as np arrays
    faces, target, target_names = load_func(
        data_folder_path, resize=resize,
        min_faces_per_person=min_faces_per_person, color=color,
        slice_=slice_)

    X = faces.reshape(len(faces), -1)

    module_path = dirname(__file__)
    with open(join(module_path, 'descr', 'lfw.rst')) as rst_file:
        fdescr = rst_file.read()

    if return_X_y:
        return X, target

    # pack the results as a Bunch instance
    return Bunch(data=X, images=faces,
                 target=target, target_names=target_names,
                 DESCR=fdescr)
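# Usage sketch for the loader above, again via the public scikit-learn
# entry point and assuming the data can be downloaded. min_faces_per_person
# filters the set down to frequently photographed people; return_X_y skips
# the Bunch wrapper. The commented shapes are approximate.
from sklearn.datasets import fetch_lfw_people

lfw_people = fetch_lfw_people(min_faces_per_person=70)
print(lfw_people.images.shape)       # roughly (1288, 62, 47) for this filter
print(len(lfw_people.target_names))  # 7 people have at least 70 images

X, y = fetch_lfw_people(min_faces_per_person=70, return_X_y=True)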
import numpy as np
from joblib import Memory
from sklearn.preprocessing import OneHotEncoder

cache = Memory('cache').cache


@cache
def get_embedding_dim(embedding_path):
    # The embedding file is "word x1 x2 ... xd" per line, so the dimension
    # is the number of fields on the first line minus the word itself.
    with open(embedding_path, 'rb') as f:
        return len(f.readline().split()) - 1


@cache
def get_embedding_matrix(vocab, embedding_path):
    word2ind = {w: i for i, w in enumerate(vocab)}
    embedding_dim = get_embedding_dim(embedding_path)
    # Words missing from the embedding file keep a random initialisation
    embeddings = np.random.normal(size=(len(vocab), embedding_dim))
    with open(embedding_path, 'rb') as f:
        for line in f:
            parts = line.split()
            # decode so the lookup works against a str vocabulary
            # (the file is read in binary mode)
            word = parts[0].decode('utf-8')
            if word in word2ind:
                i = word2ind[word]
                vec = np.array([float(x) for x in parts[1:]])
                embeddings[i] = vec
    return embeddings
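# Usage sketch for the embedding helpers above. The vocabulary and the
# GloVe-style file path are hypothetical; the first call parses the whole
# file, and subsequent calls with the same arguments are answered from the
# joblib cache directory ('cache').
if __name__ == '__main__':
    vocab = ['the', 'cat', 'sat']                          # hypothetical vocabulary
    emb = get_embedding_matrix(vocab, 'glove.6B.50d.txt')  # hypothetical path
    print(emb.shape)                                       # (len(vocab), embedding_dim)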
# pylint: disable=unused-argument, broad-except
from typing import (
    List,
    Optional,
    Union,
)
from pathlib import Path

from joblib import Memory
from polyglot.text import Detector
from logzero import logger

from .seg_text import seg_text

memory = Memory(location=Path("joblib_cache"), verbose=0)


# fmt: off
# @memory.cache(ignore=['debug'])
def _sent_tokenizer(
        text: Union[str, List[str]],
        lang: Optional[str] = None,
        debug: bool = False,  # when True, disable joblib.Memory.cache
) -> List[str]:
    # fmt: on
    """Tokenize str|List[str] to sents."""
    if isinstance(text, str):
        text = [text]

    if lang is None:
# http://www.njtransit.com/sf/sf_servlet.srv?hdnPageAction=TripPlannerTo
# http://www.njtransit.com/sf/sf_servlet.srv?hdnPageAction=TripPlannerServiceNearTo
import lxml.html
import requests
from joblib import Memory

from geolocate import geocode
from utils import find_nearest_weekday

# http://www.njtransit.com/rg/rg_servlet.srv?hdnPageAction=StationParkRideTo

memory = Memory(cachedir='.cache', verbose=0)


@memory.cache
def plan_trip_inner(source, destination):
    source_geocoded = geocode(source)
    destination_geocoded = geocode(destination)
    departure_time = find_nearest_weekday().replace(
        hour=6, minute=0)  # 6 am nearest weekday
    response = requests.post(
        "http://www.njtransit.com/sf/sf_servlet.srv?hdnPageAction=TripPlannerItineraryFrom",
        data={
            "starting_street_address": source,
            "dest_street_address": destination,
            dim=dim,
            noise_corr=noise_corr,
            sep=sep,
            score_error=(np.mean(this_scores) - validation_score),
            score_sem=(np.std(this_scores) / np.sqrt(len(this_scores))),
        ))
    return scores


###############################################################################
# Run the simulations
N_JOBS = -1
N_DRAWS = 1000

mem = Memory(cachedir='cache')

results = pandas.DataFrame(
    columns=['cv_name', 'validation_score', 'train_size', 'dim',
             'noise_corr', 'sep', 'score_error', 'score_sem'])

for dim, sep in [(300, 5.),
                 (10000, 60.),
                 (10, .5),
                 (1, .13),
                 ]:
    if dim > 1000:
        # Avoid memory problems
        n_jobs = 20
        def inner():
            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=max_workers) as executor:
                yield from executor.map(f, it)

        return inner()
    else:
        return map(f, it)


def interruptible(fn, *args, **kwargs):
    """Run fn in another thread.

    This makes it possible to keep processing signals (hence
    KeyboardInterrupt) when fn is a long-running non-Python function that
    releases the GIL.
    """
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    future = executor.submit(fn, *args, **kwargs)
    try:
        return future.result()
    except BaseException:
        future.cancel()
        executor.shutdown(wait=True)
        raise


cache_dir = os.getenv("STRAPS_CACHE_DIR")
if cache_dir:
    os.makedirs(cache_dir, exist_ok=True)
# If cache_dir is None, Memory acts as a transparent wrapper.
pdt_cache = Memory(cache_dir, verbose=0)
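# Sketch of the fallback behaviour relied on above: when the location
# handed to Memory is None (here, STRAPS_CACHE_DIR unset), .cache() hands
# back the function essentially unwrapped, so the decorator can be applied
# unconditionally without writing anything to disk. `square` is a toy
# example, not part of the original module.
from joblib import Memory

_no_cache = Memory(None, verbose=0)


@_no_cache.cache
def square(x):
    return x * x


print(square(3))  # recomputed on every call; no cache directory is created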
import os

import numpy as np
from joblib import Memory

from sklearn.datasets import fetch_openml, get_data_home
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics import zero_one_loss
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_array
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'mnist_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_openml('mnist_784')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    X = X / 255
#!/usr/bin/env python

import os

import numpy as np
from sklearn.datasets import make_blobs  # samples_generator was removed in newer sklearn
from joblib import Memory

_memory = Memory('.', verbose=1)

DATA_DIR = os.path.expanduser('~/Desktop/datasets/nn-search')
join = os.path.join


class Random:
    UNIFORM = 'uniform'
    GAUSS = 'gauss'
    WALK = 'walk'
    BLOBS = 'blobs'


class Gist:
    DIR = join(DATA_DIR, 'gist')
    TRAIN = join(DIR, 'gist_train.npy')      # noqa
    TEST = join(DIR, 'gist.npy')             # noqa
    TEST_100 = join(DIR, 'gist_100k.npy')    # noqa
    TEST_200 = join(DIR, 'gist_200k.npy')    # noqa
    QUERIES = join(DIR, 'gist_queries.npy')  # noqa
    TRUTH = join(DIR, 'gist_truth.npy')      # noqa


class Sift1M:
from time import time
from typing import Callable, Tuple
from warnings import warn

import cupy as cp
import numpy as np
from cupyx.scipy import linalg
from joblib import Memory
from numpy import linalg as np_linalg

from GeneralEstimator import EstimatorDiscretize
from Operator import Operator
from decorators import timer

location = './cachedir'
memory = Memory(location, verbose=0, bytes_limit=1024 * 1024 * 1024)


@memory.cache
def numpy_svd(A_cpu: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    return np_linalg.svd(A_cpu, full_matrices=True, compute_uv=True,
                         hermitian=True)


class Landweber(EstimatorDiscretize, Operator):
    def __init__(self, kernel: Callable, lower: float, upper: float,
                 grid_size: int, observations: np.ndarray, sample_size: int,
                 adjoint: bool = False, quadrature: str = 'rectangle',
                 **kwargs):
        """
        Instance of Landweber solver for inverse problem in Poisson noise
        with integral operator.

        :param kernel: Kernel of the integral operator.
        :type kernel: Callable
import numpy as np
from joblib import Memory

location = './cachedir'
memory = Memory(location, verbose=0)


@memory.cache()
def main(points_interest, T_space, axes):
    '''
    Calculates the lengths of the butterfly wings and draws the
    corresponding lines.

    Parameters
    ----------
    points_interest : array
        the array containing the four points of interest, each of which
        is a coordinate specifying the start/end point of the left/right
        wing.
    T_space : float
        number of pixels between 2 ticks.
    axes : array
        the array containing the 3 intermediary Axes.

    Returns
    -------
    ax : ax
        an ax object
    dst_pix : tuple
        the tuple contains the distance of the left/right wing distance
        in pixels
    dst_mm : tuple