Ejemplo n.º 1
0
 def _parse_data(self, _cache=True):
     """Build a DataFrame from the uploaded files or body of the request.

     Files in ``self.request.files`` take precedence. Each one is written
     to ``self.uploads_dir``, read back through ``cache.open`` (``.json``
     files with ``pd.read_json``), deleted again, and all the frames are
     concatenated row-wise. Without uploads, the request body is parsed:
     as JSON when the ``Content-Type`` header is exactly
     ``application/json``, as a URL-encoded query string otherwise.

     Args:
         _cache: When true, the parsed data is passed to
             ``self.store_data``. It is forced to false when a JSON body
             cannot be parsed and ``self.load_data()`` is used instead.

     Returns:
         The parsed data, or the result of ``self.load_data()`` when the
         parsed data is empty or the JSON body is invalid.
     """
     # First look in self.request.files
     if len(self.request.files) > 0:
         dfs = []
         for files in self.request.files.values():
             for f in files:
                 # The filename is supplied by the client: keep only its
                 # last path component so that it cannot point outside
                 # the uploads directory.
                 outpath = op.join(self.uploads_dir,
                                   op.basename(f['filename']))
                 try:
                     with open(outpath, 'wb') as fout:
                         fout.write(f['body'])
                     if outpath.endswith('.json'):
                         xdf = cache.open(outpath, pd.read_json)
                     else:
                         xdf = cache.open(outpath)
                 finally:
                     # Delete the temporary copy even when reading fails
                     if op.isfile(outpath):
                         os.remove(outpath)
                 dfs.append(xdf)
         data = pd.concat(dfs, axis=0)
     # Otherwise look in request.body
     else:
         if self.request.headers.get('Content-Type',
                                     '') == 'application/json':
             try:
                 data = pd.read_json(self.request.body.decode('utf8'))
             except ValueError:
                 # Unparseable JSON: fall back to the stored data and do
                 # not write it back to the store.
                 data = self.load_data()
                 _cache = False
         else:
             data = pd.DataFrame.from_dict(
                 parse_qs(self.request.body.decode('utf8')))
     if _cache:
         self.store_data(data)
     if len(data) == 0:
         data = self.load_data()
     return data
Ejemplo n.º 2
0
def train_method(handler):
    """
    Fit the requested model on a dataset and render its test score.

    Every input is read with `handler.get_argument('arg')`:

    - `url`: dataset path, joined onto YAMLPATH and opened with `cache.open`
    - `model`: dotted name resolved with `locate` and called with no arguments
    - `testSize`: share of rows held out for testing, given as a percentage
    - `targetCol`: name of the column to predict

    Returns the output of the `report.html` template, generated with the
    score, the fitted model and the chart spec from `_make_chart`.
    """
    dataset = cache.open(op.join(YAMLPATH, handler.get_argument('url')))
    # model, testSize and targetCol are part of the arguments sent via the
    # `train_method` AJAX call.
    # NOTE(review): the model name comes straight from the request, so any
    # importable callable can be invoked here.
    model_factory = locate(handler.get_argument('model'))
    estimator = model_factory()
    holdout = float(handler.get_argument('testSize')) / 100
    label_name = handler.get_argument('targetCol')

    labels = dataset[label_name]
    feature_names = [name for name in dataset if name != label_name]
    features = dataset[feature_names]

    # Stratified, shuffled train/test split on the raw arrays
    splits = train_test_split(features.values, labels.values,
                              test_size=holdout, shuffle=True,
                              stratify=labels.values)
    train_x, test_x, train_y, test_y = splits
    estimator.fit(train_x, train_y)
    accuracy = estimator.score(test_x, test_y)

    # The result is rendered through report.html
    report_path = op.join(YAMLPATH, 'report.html')
    with open(report_path, 'r', encoding='utf8') as fin:
        report = Template(fin.read())
    chart_spec = _make_chart(estimator, features)
    return report.generate(score=accuracy, model=estimator, spec=chart_spec)
Ejemplo n.º 3
0
 def _predict(self, data, score_col=False, transform=True):
     """Predict on ``data``, or score it when ``score_col`` is one of its columns.

     The model is (re)loaded from ``self.model_path`` on every call and
     kept on ``self.model``.

     Args:
         data: Input passed to ``self._transform`` and then to the model.
         score_col: Name of the column holding the expected values. When
             it is truthy and present in the data, the model is scored
             against it instead of predicting.
         transform: Whether to run ``self._transform`` on ``data`` first.

     Returns:
         The result of ``self.model.score`` or of ``self.model.predict``.
     """
     frame = self._transform(data, deduplicate=False) if transform else data
     self.model = cache.open(self.model_path, joblib.load)
     if not (score_col and score_col in frame):
         return self.model.predict(frame)
     expected = frame[score_col]
     features = frame.drop([score_col], axis=1)
     return self.model.score(features, expected)
Ejemplo n.º 4
0
 def _check_model_path(self, error='raise'):
     """Check that a model file exists and load it if none is loaded yet.

     Args:
         error: ``'raise'`` raises an ``HTTPError`` when no file exists at
             ``self.model_path``. Any other value only emits a warning.

     Raises:
         HTTPError: With status ``NOT_FOUND`` when the file is missing and
             ``error`` is ``'raise'``.
     """
     if not op.exists(self.model_path):
         msg = f'No model found at {self.model_path}'
         if error == 'raise':
             raise HTTPError(NOT_FOUND, log_message=msg)
         import warnings
         warnings.warn(msg)
         # There is no file to load. Going on to open it would defeat the
         # purpose of the warn-only mode.
         return
     if self.model is None:
         self.model = cache.open(self.model_path, joblib.load)
Ejemplo n.º 5
0
def get_answer(handler):
    '''Return the stored question closest to the query, with its answer.

    The query is read from the `q` request argument (default `''`). It is
    stemmed and vectorized together with every entry of the `Question`
    column of the data at `file_path`, and the entry with the highest
    cosine similarity to the query is picked.

    Returns a dict with the keys `similarity`, `question` and `answer`.
    '''
    text = handler.get_arg('q', '')
    df = cache.open(file_path)
    questions = df['Question']
    # Row 0 of the matrix is the query, the remaining rows are the questions
    tfidf = vectorizer.fit_transform(
        stem(s) for s in [text] + questions.values.tolist())
    similarity = cosine_similarity(tfidf[0:1], tfidf[1:])[0]
    top_index = similarity.argmax()
    # argmax() gives a position, not an index label, so look the row up
    # positionally. This stays correct when the index is not 0..n-1.
    return {
        'similarity': similarity[top_index],
        'question': questions.iloc[top_index],
        'answer': df['Answer'].iloc[top_index]
    }
Ejemplo n.º 6
0
 def _predict(self, data=None, score_col=''):
     """Score the model against ``score_col``, or predict when it is absent.

     The model is (re)loaded from ``self.model_path`` on every call and
     kept on ``self.model``.

     Args:
         data: Input data. When ``None`` it is read from the request with
             ``self._parse_data(False)``.
         score_col: Name of the column holding the expected values.

     Returns:
         The result of ``self.model.score`` when ``score_col`` is a column
         of the transformed data. Otherwise the data, restricted to the
         feature names of the model's ``transform`` step, with the
         predictions added in the column named by the ``target_col``
         option (default ``'_prediction'``).
     """
     if data is None:
         data = self._parse_data(False)
     data = self._transform(data, drop_duplicates=False)
     self.model = cache.open(self.model_path, joblib.load)
     # Only the column lookup is guarded: a KeyError raised while scoring
     # must surface instead of being mistaken for a missing score column.
     try:
         target = data.pop(score_col)
     except KeyError:
         # Set data in the same order as the transformer requests
         data = data[self.model.named_steps['transform']._feature_names_in]
         data[self.get_opt('target_col',
                           '_prediction')] = self.model.predict(data)
         return data
     return self.model.score(data, target)
Ejemplo n.º 7
0
 def _predict(self, data=None, score_col=''):
     """Score the model against ``score_col``, or predict when it is absent.

     The model is (re)loaded from ``self.model_path`` on every call and
     kept on ``self.model``.

     Args:
         data: Input data. When ``None`` it is read from the request with
             ``self._parse_data(False)``.
         score_col: Name of the column holding the expected values.

     Returns:
         When ``score_col`` is a column of the transformed data: the value
         of the scorer named by the ``_metric`` request argument, or of
         ``self.model.score`` when no metric is given. Otherwise the data
         with the predictions added in the column named by the
         ``target_col`` option (default ``'_prediction'``). If predicting
         fails, the error is logged and the data is returned as it is at
         that point.
     """
     if data is None:
         data = self._parse_data(False)
     data = self._transform(data, drop_duplicates=False)
     self.model = cache.open(self.model_path, joblib.load)
     # Only the column lookup is guarded: a KeyError raised while scoring
     # must surface instead of being mistaken for a missing score column.
     try:
         target = data.pop(score_col)
     except KeyError:
         # Set data in the same order as the transformer requests
         try:
             data = data[
                 self.model.named_steps['transform']._feature_names_in]
             data[self.get_opt('target_col',
                               '_prediction')] = self.model.predict(data)
         except Exception as exc:
             # Best effort: log the failure and hand back the data as is
             app_log.exception(exc)
         return data
     metric = self.get_argument('_metric', False)
     if metric:
         scorer = get_scorer(metric)
         return scorer(self.model, data, target)
     return self.model.score(data, target)
Ejemplo n.º 8
0
    def setup(cls, data=None, model=None, config_dir='', **kwargs):
        """Set up the config store, the data and the model on the class.

        Args:
            data: A str is opened with ``cache.open``; a dict is passed as
                keyword arguments to ``gdata.filter``; anything else is
                treated as no data. Data that was loaded is saved with
                ``cls.store_data``.
            model: Optional dict of model settings. The keys read here are
                ``path``, ``class``, ``params``, ``target_col``,
                ``pipeline``, ``async`` and the keys of
                ``TRAINING_DEFAULTS``. NOTE: ``path`` is popped, so the
                dict passed in is modified.
            config_dir: Directory for the handler's configuration. When it
                is not an existing directory,
                ``<GRAMEXDATA>/apps/mlhandler/<slug>`` is used instead and
                passed to ``_mkdir``.
            **kwargs: ``template`` is popped (default ``True``); the rest
                is forwarded to the parent class's ``setup``.
        """
        cls.slug = slugify(cls.name)
        if not op.isdir(config_dir):
            config_dir = op.join(gramex.config.variables['GRAMEXDATA'], 'apps',
                                 'mlhandler', cls.slug)
            _mkdir(config_dir)
        cls.config_dir = config_dir
        cls.uploads_dir = op.join(config_dir, 'uploads')
        _mkdir(cls.uploads_dir)
        # flush=None here; the store is flushed explicitly on the last line
        # of this method
        cls.config_store = cache.JSONStore(op.join(cls.config_dir,
                                                   'config.json'),
                                           flush=None)
        cls.data_store = op.join(cls.config_dir, 'data.h5')
        cls.template = kwargs.pop('template', True)
        super(MLHandler, cls).setup(**kwargs)
        # Normalise `data`: a path is opened, a dict is used as filter
        # arguments, any other type is ignored
        if isinstance(data, str):
            data = cache.open(data)
        elif isinstance(data, dict):
            data = gdata.filter(**data)
        else:
            data = None
        if data is not None:
            cls.store_data(data)

        # parse model kwargs
        if model is None:
            model = {}

        default_model_path = op.join(gramex.config.variables['GRAMEXDATA'],
                                     'apps', 'mlhandler',
                                     slugify(cls.name) + '.pkl')
        # NOTE: pop() removes 'path' from the dict the caller passed in
        model_path = model.pop('path', default_model_path)

        # store the model kwargs from gramex.yaml into the store
        # (only for keys that do not already have a truthy value there)
        for key in TRAINING_DEFAULTS:
            kwarg = model.get(key, False)
            if not cls.get_opt(key, False) and kwarg:
                cls.set_opt(key, kwarg)
        if op.exists(model_path):  # If the pkl exists, load it
            # NOTE(review): joblib.load unpickles the file, so model_path
            # must point to a trusted file
            cls.model = joblib.load(model_path)
            cls.model_path = model_path
            # A target_col given in `model` overrides the stored one
            target_col = model.get('target_col', False)
            if target_col:
                cls.set_opt('target_col', target_col)
            else:
                target_col = cls.get_opt('target_col')
        else:  # build the model
            # The stored class and params take precedence over the ones
            # given in `model`
            mclass = cls.get_opt('class', model.get('class', False))
            params = cls.get_opt('params', {})
            if not params:
                params = model.get('params', {})
            if mclass:
                cls.model = search_modelclass(mclass)(**params)
                cls.set_opt('class', mclass)
            else:
                cls.model = None
            # Params MUST come after class, or they will be ignored.
            cls.set_opt('params', params)

            if model_path:  # if a path is specified, use it to store the model
                cls.model_path = model_path
            else:  # or create our own path
                # NOTE(review): model.pop() above already defaults to
                # default_model_path, so this branch is only reached when
                # 'path' was explicitly given a falsy value
                cls.model_path = default_model_path
                _mkdir(op.dirname(cls.model_path))

            # train the model
            # A target_col given in `model` overrides the stored one
            target_col = model.get('target_col', False)
            if target_col:
                cls.set_opt('target_col', target_col)
            else:
                target_col = cls.get_opt('target_col', False)
            if cls.model is not None and not target_col:
                app_log.warning('Target column not defined. Nothing to do.')
            else:
                # Training needs both a model and data; otherwise nothing
                # is fitted
                if cls.model is not None:
                    if data is not None:
                        # filter columns
                        data = cls._filtercols(data)

                        # filter rows
                        data = cls._filterrows(data)

                        # assemble the pipeline (the default), or start
                        # again from a fresh instance of the model class
                        if model.get('pipeline', True):
                            cls.model = cls._get_pipeline(data)
                        else:
                            cls.model = search_modelclass(mclass)(**params)

                        # train the model on every column except the target
                        target = data[target_col]
                        train = data[[c for c in data if c != target_col]]
                        # By default _fit is submitted to gramex's
                        # threadpool; with async false it runs inline
                        if model.get('async', True):
                            gramex.service.threadpool.submit(
                                _fit, cls.model, train, target, cls.model_path,
                                cls.name)
                        else:
                            _fit(cls.model, train, target, cls.model_path,
                                 cls.name)
        cls.config_store.flush()
Ejemplo n.º 9
0
 def _check_model_path(self):
     """Load the model stored at ``self.model_path`` into ``self.model``.

     Raises:
         HTTPError: With status ``NOT_FOUND`` when loading the model
             raises ``FileNotFoundError``.
     """
     try:
         loaded = cache.open(self.model_path, joblib.load)
     except FileNotFoundError:
         raise HTTPError(NOT_FOUND, f'No model found at {self.model_path}')
     else:
         self.model = loaded
Ejemplo n.º 10
0
def suggestion(handler):
    '''Return 3 sampled questions as a JSON array, to be used as suggestions.'''
    # The questions are drawn with .sample(3), not taken from the top
    entries = cache.open(file_path)
    picked = entries.sample(3)
    return picked['Question'].to_json(orient='values')