def _parse_data(self, _cache=True):
    # First look in self.request.files
    if len(self.request.files) > 0:
        dfs = []
        for _, files in self.request.files.items():
            for f in files:
                outpath = op.join(self.uploads_dir, f['filename'])
                with open(outpath, 'wb') as fout:
                    fout.write(f['body'])
                if outpath.endswith('.json'):
                    xdf = cache.open(outpath, pd.read_json)
                else:
                    xdf = cache.open(outpath)
                dfs.append(xdf)
                os.remove(outpath)
        data = pd.concat(dfs, axis=0)
    # Otherwise look in request.body
    else:
        if self.request.headers.get('Content-Type', '') == 'application/json':
            try:
                data = pd.read_json(self.request.body.decode('utf8'))
            except ValueError:
                data = self.load_data()
                _cache = False
        else:
            data = pd.DataFrame.from_dict(
                parse_qs(self.request.body.decode('utf8')))
    if _cache:
        self.store_data(data)
    if len(data) == 0:
        data = self.load_data()
    return data
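# A minimal standalone sketch of the form-encoded branch above, assuming only
# pandas and the standard library: parse_qs returns a dict of lists, which
# DataFrame.from_dict turns into one column per field. The body string here
# is hypothetical, standing in for self.request.body.
from urllib.parse import parse_qs

import pandas as pd

body = 'x=1&x=2&y=a&y=b'
df = pd.DataFrame.from_dict(parse_qs(body))
# df now has columns 'x' and 'y' with two rows:
#    x  y
# 0  1  a
# 1  2  b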
def train_method(handler):
    """Train and test the dataset.

    Note that `handler.get_argument('arg')` is used to read URL parameters.
    """
    url = handler.get_argument('url')
    url = op.join(YAMLPATH, url)
    df = cache.open(url)
    # model, testSize and targetCol are sent as URL arguments in the
    # `train_method` AJAX call.
    clf = locate(handler.get_argument('model'))()
    test_size = float(handler.get_argument('testSize')) / 100
    target_col = handler.get_argument('targetCol')
    dfy = df[target_col]
    dfx = df[[c for c in df if c != target_col]]
    x, y = dfx.values, dfy.values
    # split into train/test sets, fit the classifier and measure its accuracy
    xtrain, xtest, ytrain, ytest = train_test_split(
        x, y, test_size=test_size, shuffle=True, stratify=y)
    clf.fit(xtrain, ytrain)
    score = clf.score(xtest, ytest)
    # the output is rendered to report.html
    with open(op.join(YAMLPATH, 'report.html'), 'r', encoding='utf8') as fin:
        tmpl = Template(fin.read())
    viz = _make_chart(clf, dfx)
    return tmpl.generate(score=score, model=clf, spec=viz)
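# A minimal sketch of the split/fit/score flow used in train_method, with a
# bundled dataset and a fixed classifier in place of the URL parameters.
# Assumes scikit-learn; the choice of LogisticRegression is illustrative.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

x, y = load_iris(return_X_y=True)
# stratify=y keeps the class proportions identical in both splits
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.25, shuffle=True, stratify=y)
clf = LogisticRegression(max_iter=1000).fit(xtrain, ytrain)
print(clf.score(xtest, ytest))  # mean accuracy on the held-out 25%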
def _predict(self, data, score_col=False, transform=True):
    if transform:
        data = self._transform(data, deduplicate=False)
    self.model = cache.open(self.model_path, joblib.load)
    if score_col and score_col in data:
        target = data[score_col]
        data = data.drop([score_col], axis=1)
        return self.model.score(data, target)
    return self.model.predict(data)
def _check_model_path(self, error='raise'):
    if not op.exists(self.model_path):
        msg = f'No model found at {self.model_path}'
        if error == 'raise':
            raise HTTPError(NOT_FOUND, log_message=msg)
        else:
            import warnings
            warnings.warn(msg)
    if self.model is None:
        self.model = cache.open(self.model_path, joblib.load)
def get_answer(handler):
    '''Return the stored answer whose question is most similar to the query.'''
    text = handler.get_arg('q', '')
    df = cache.open(file_path)
    tfidf = vectorizer.fit_transform(
        stem(s) for s in [text] + df['Question'].values.tolist())
    similarity = cosine_similarity(tfidf[0:1], tfidf[1:])[0]
    top_index = similarity.argmax()
    return {
        'similarity': similarity[top_index],
        'question': df['Question'][top_index],
        'answer': df['Answer'][top_index]
    }
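# A standalone sketch of the TF-IDF lookup in get_answer: vectorise the query
# together with the known questions, then compare row 0 (the query) against
# the rest. The corpus here is illustrative, and stemming is skipped.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

questions = ['How do I reset my password?', 'Where can I download the app?']
query = 'forgot my password'
tfidf = TfidfVectorizer().fit_transform([query] + questions)
similarity = cosine_similarity(tfidf[0:1], tfidf[1:])[0]
print(questions[similarity.argmax()])  # -> 'How do I reset my password?'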
def _predict(self, data=None, score_col=''):
    if data is None:
        data = self._parse_data(False)
    data = self._transform(data, drop_duplicates=False)
    self.model = cache.open(self.model_path, joblib.load)
    try:
        target = data.pop(score_col)
        return self.model.score(data, target)
    except KeyError:
        # Reorder the columns to match what the transformer expects
        data = data[self.model.named_steps['transform']._feature_names_in]
        data[self.get_opt('target_col', '_prediction')] = self.model.predict(data)
        return data
def _predict(self, data=None, score_col=''):
    if data is None:
        data = self._parse_data(False)
    data = self._transform(data, drop_duplicates=False)
    self.model = cache.open(self.model_path, joblib.load)
    try:
        target = data.pop(score_col)
        metric = self.get_argument('_metric', False)
        if metric:
            scorer = get_scorer(metric)
            return scorer(self.model, data, target)
        return self.model.score(data, target)
    except KeyError:
        # Reorder the columns to match what the transformer expects
        try:
            data = data[
                self.model.named_steps['transform']._feature_names_in]
            data[self.get_opt('target_col', '_prediction')] = self.model.predict(data)
        except Exception as exc:
            app_log.exception(exc)
        return data
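# A minimal sketch of the `_metric` branch above: sklearn's get_scorer turns
# a metric name into a callable with the (estimator, X, y) signature used in
# _predict. Scoring on the training data here is purely for illustration.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import get_scorer

x, y = load_iris(return_X_y=True)
clf = LogisticRegression(max_iter=1000).fit(x, y)
scorer = get_scorer('f1_weighted')
print(scorer(clf, x, y))  # weighted F1 instead of the default accuracy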
@classmethod
def setup(cls, data=None, model=None, config_dir='', **kwargs):
    cls.slug = slugify(cls.name)
    if not op.isdir(config_dir):
        config_dir = op.join(gramex.config.variables['GRAMEXDATA'],
                             'apps', 'mlhandler', cls.slug)
        _mkdir(config_dir)
    cls.config_dir = config_dir
    cls.uploads_dir = op.join(config_dir, 'uploads')
    _mkdir(cls.uploads_dir)
    cls.config_store = cache.JSONStore(
        op.join(cls.config_dir, 'config.json'), flush=None)
    cls.data_store = op.join(cls.config_dir, 'data.h5')
    cls.template = kwargs.pop('template', True)
    super(MLHandler, cls).setup(**kwargs)
    if isinstance(data, str):
        data = cache.open(data)
    elif isinstance(data, dict):
        data = gdata.filter(**data)
    else:
        data = None
    if data is not None:
        cls.store_data(data)
    # parse model kwargs
    if model is None:
        model = {}
    default_model_path = op.join(
        gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler',
        slugify(cls.name) + '.pkl')
    model_path = model.pop('path', default_model_path)
    # store the model kwargs from gramex.yaml into the store
    for key in TRAINING_DEFAULTS:
        kwarg = model.get(key, False)
        if not cls.get_opt(key, False) and kwarg:
            cls.set_opt(key, kwarg)
    if op.exists(model_path):
        # If the pkl exists, load it
        cls.model = joblib.load(model_path)
        cls.model_path = model_path
        target_col = model.get('target_col', False)
        if target_col:
            cls.set_opt('target_col', target_col)
        else:
            target_col = cls.get_opt('target_col')
    else:
        # build the model
        mclass = cls.get_opt('class', model.get('class', False))
        params = cls.get_opt('params', {})
        if not params:
            params = model.get('params', {})
        if mclass:
            cls.model = search_modelclass(mclass)(**params)
            cls.set_opt('class', mclass)
        else:
            cls.model = None
        # Params MUST come after class, or they will be ignored.
        cls.set_opt('params', params)
        if model_path:
            # if a path is specified, use it to store the model
            cls.model_path = model_path
        else:
            # or create our own path
            cls.model_path = default_model_path
            _mkdir(op.dirname(cls.model_path))
        # train the model
        target_col = model.get('target_col', False)
        if target_col:
            cls.set_opt('target_col', target_col)
        else:
            target_col = cls.get_opt('target_col', False)
        if cls.model is not None and not target_col:
            app_log.warning('Target column not defined. Nothing to do.')
        else:
            if cls.model is not None and data is not None:
                # filter columns
                data = cls._filtercols(data)
                # filter rows
                data = cls._filterrows(data)
                # assemble the pipeline
                if model.get('pipeline', True):
                    cls.model = cls._get_pipeline(data)
                else:
                    cls.model = search_modelclass(mclass)(**params)
                # train the model
                target = data[target_col]
                train = data[[c for c in data if c != target_col]]
                if model.get('async', True):
                    gramex.service.threadpool.submit(
                        _fit, cls.model, train, target, cls.model_path, cls.name)
                else:
                    _fit(cls.model, train, target, cls.model_path, cls.name)
    cls.config_store.flush()
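# A minimal sketch of the load-or-build logic in setup, assuming only joblib
# and scikit-learn: reuse a persisted model if the pickle exists, otherwise
# build a fresh one and persist it. Path and model class are illustrative.
import os.path as op

import joblib
from sklearn.linear_model import LogisticRegression

model_path = 'model.pkl'  # hypothetical location
if op.exists(model_path):
    model = joblib.load(model_path)  # reuse the persisted model
else:
    model = LogisticRegression()  # build (and later fit) a fresh one
    joblib.dump(model, model_path)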
def _check_model_path(self):
    try:
        self.model = cache.open(self.model_path, joblib.load)
    except FileNotFoundError:
        raise HTTPError(NOT_FOUND, f'No model found at {self.model_path}')
def suggestion(handler):
    '''Return 3 randomly sampled questions as suggestions.'''
    return cache.open(file_path).sample(3)['Question'].to_json(orient='values')