class ModelsApiPredict(Resource):

    @doc(description=dedent("""
           Predict document categorization with a previously trained model

           **Parameters**
            - `max_result_categories` : the maximum number of categories in the results
            - `sort` : sort by the score of the most likely class
            - `ml_output` : type of the output in ['decision_function', 'probability'], only affects ML methods.
            - `nn_metric` : The similarity returned by the nearest neighbors classifier in ['cosine', 'jaccard', 'cosine_norm', 'jaccard_norm'].
            - `min_score` : filter out results below a similarity threshold
          """))
    @use_args({'max_result_categories': wfields.Int(missing=1),
               'sort': wfields.Boolean(missing=False),
               'ml_output': wfields.Str(missing='probability'),
               'nn_metric': wfields.Str(missing='jaccard_norm'),
               'min_score': wfields.Number(missing=-1)})
    @marshal_with(CategorizationPredictSchema())
    def get(self, mid, **args):
        sort = args.pop('sort')
        max_result_categories = args.pop('max_result_categories')
        min_score = args.pop('min_score')

        cat = _CategorizerWrapper(self._cache_dir, mid=mid)
        y_res, nn_res = cat.predict(**args)
        res = _CategorizerWrapper.to_dict(y_res, nn_res, cat.le.classes_,
                                          cat.fe.db.data,
                                          sort=sort,
                                          max_result_categories=max_result_categories,
                                          min_score=min_score)
        return res
class DBSCANClusteringApi(Resource):

    @doc(description=dedent("""
           Compute clustering (DBSCAN)

           The option `use_hashing=False` must be set for the feature extraction.
           Recommended options for the data ingestion also include
           `use_idf=1, sublinear_tf=0, binary=0`.

           **Parameters**
             - `parent_id`: `dataset_id` or `lsi_id`
             - `min_similarity`: The radius of the subcluster obtained by merging a new sample and the closest subcluster should be less than the threshold. Otherwise a new subcluster is started. See [sklearn.cluster.Birch](http://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html)
             - `nn_metric` : The similarity returned by the nearest neighbors classifier in ['cosine', 'jaccard', 'cosine_norm', 'jaccard_norm'].
             - `min_samples`: (optional) int The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself.
          """))
    @use_args({'parent_id': wfields.Str(required=True),
               'min_samples': wfields.Int(missing=10),
               'min_similarity': wfields.Number(missing=0.75),  # corresponds approximately to threshold = 0.5
               'nn_metric': wfields.Str(missing='jaccard_norm')})
    @marshal_with(IDSchema())
    def post(self, **args):
        from math import sqrt

        S_cos = _scale_cosine_similarity(args.pop('min_similarity'),
                                         metric=args.pop('nn_metric'),
                                         inverse=True)
        # convert cosine similarity to euclidean distance
        eps = sqrt(2 * (1 - S_cos))

        cl = _ClusteringWrapper(cache_dir=self._cache_dir,
                                parent_id=args.pop('parent_id'))
        cl.dbscan(eps=eps, **args)
        return {'id': cl.mid}
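# The `eps = sqrt(2 * (1 - S_cos))` conversion above relies on the identity that,
# for L2-normalized vectors, ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x.y = 2 * (1 - cos(x, y)),
# so a cosine-similarity threshold maps directly to a euclidean radius.
# A minimal sketch checking this numerically (the vectors here are illustrative,
# not part of the API above):
import numpy as np

x = np.array([1.0, 2.0, 3.0])
y = np.array([0.5, 1.0, -2.0])
x /= np.linalg.norm(x)
y /= np.linalg.norm(y)

cos_sim = float(x @ y)
eps = np.sqrt(2 * (1 - cos_sim))
# eps equals the euclidean distance between the normalized vectors
assert np.isclose(eps, np.linalg.norm(x - y))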
class RateItemHandler(RequestHandler):
    executor = ThreadPoolExecutor(8)

    def initialize(self, SharedModel):
        self.SharedModel = SharedModel

    @run_on_executor()
    @use_args({'user_id': fields.Int(required=True),
               'item_id': fields.Int(required=True),
               'positive': fields.Int(required=False, missing=None),
               'rate': fields.Number(required=False, missing=None)},
              location='querystring')
    def post(self, reqargs):
        # Saves information about a user rating action.
        # Data is kept locally and written out once per 30 minutes.
        uid = reqargs['user_id']
        iid = reqargs['item_id']
        positive = reqargs['positive']
        rate = reqargs['rate']
        if all(map(lambda v: v is None, (positive, rate))):
            return self.write({'error': 'positive or rate should be provided',
                               'args': reqargs})
        mapper = get_mapper()
        model = get_model()

        uid, iid = list(map(int, (uid, iid)))
        uix = mapper.get_user_ix(uid)
        iix = mapper.get_item_ix(iid)

        # `positive` is binary; the model needs a rating in the 0..10 range
        if positive is not None:
            rate = float(positive) * 10
        elif rate is not None:
            rate = float(rate) * 10
        else:
            return self.write({'error': 'rate or positive args should be provided'})

        if iix == -1:
            return self.write({'error': 'unknown item id', 'args': reqargs})
        if uix == -1:
            # unknown user: add the user to the mapper and initialize their data
            mapper.add_user_id(uid)
            uix = mapper.get_user_ix(uid)
            model.add_user(uix, user_views=None)

        view = pd.DataFrame.from_records([(iix, rate, uix)],
                                         columns='item_id rate user_id'.split())
        model.orig_df = model.orig_df.append(view, ignore_index=True)
        # Appending data to the dataframe at runtime lets the model filter
        # already-rated items out of its recommendations. Nothing is written
        # to the filesystem because the model is recalculated frequently.
        # During recalculation, the model pulls data from the original
        # database, which must contain all user rating actions.

        # Every N user actions, the user's recommendations are recalculated
        user_views = model.orig_df[model.orig_df.user_id == uix]
        if len(user_views) % config.recalculate_user_every_n == 0:
            print('recalc user')
            model.update_user_data(uix, user_views)
        return self.write({'status': 'ok'})
def suggested_tags_args():
    """Defines and validates suggested-tags params."""
    return {
        "team_id": fields.UUID(required=True),
        "tags": fields.List(fields.String(), missing=[]),
        "min_support": fields.Number(missing=0.25, validate=lambda val: val <= 1),
        "limit": fields.Integer(missing=3),
    }
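# A minimal sketch of how a dict of fields like the one above is typically
# wired into a Flask view with webargs (assuming webargs 6+); the route and
# handler names here are assumptions, not part of the original module.
from flask import Flask
from webargs.flaskparser import use_args

app = Flask(__name__)


@app.route("/suggested-tags")
@use_args(suggested_tags_args(), location="query")
def suggested_tags(args):
    # args holds validated values, with defaults filled in
    # (e.g. min_support=0.25, limit=3 when not supplied);
    # a min_support above 1 fails validation and webargs aborts with 422.
    return {"team_id": str(args["team_id"]),
            "tags": args["tags"],
            "min_support": args["min_support"],
            "limit": args["limit"]}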
class DupDetectionApiElement(Resource):

    @doc(description=dedent("""
           Query duplicates

           **Parameters**
            - `distance` : int, default=2 Maximum number of different bits in the simhash (Simhash method only)
            - `n_rand_lexicons` : int, default=1 number of random lexicons used for duplicate detection (I-Match method only)
            - `rand_lexicon_ratio` : float, default=0.7 ratio of the vocabulary used in random lexicons (I-Match method only)
            - `nn_metric` : The similarity returned by the nearest neighbors classifier in ['cosine', 'jaccard', 'cosine_norm', 'jaccard_norm'].
          """))
    @use_args({'distance': wfields.Int(),
               'n_rand_lexicons': wfields.Int(),
               'rand_lexicon_ratio': wfields.Number(),
               'nn_metric': wfields.Str(missing='jaccard_norm')})
    @marshal_with(ClusteringSchema())
    def get(self, mid, **args):
        nn_metric = args.pop('nn_metric')
        model = _DuplicateDetectionWrapper(cache_dir=self._cache_dir, mid=mid)
        cluster_id = model.query(**args)
        model._fit_X = model.pipeline.data  # load the data
        y = model._merge_response(cluster_id)
        res = []
        valid_keys = ['document_id', 'rendering_id', 'similarity']
        for name, group in y.groupby('cluster_id'):
            if group.shape[0] <= 1:
                continue

            S_sim_mean, S_sim = model.centroid_similarity(group.index.values,
                                                          nn_metric)
            group = group.assign(similarity=S_sim)

            row_docs = []
            for idx, row in group.iterrows():
                row_docs.append({key: val for key, val in row.to_dict().items()
                                 if key in valid_keys})
            res.append({'documents': row_docs,
                        'cluster_id': name,
                        'cluster_similarity': S_sim_mean})
        return {'data': res}

    @marshal_with(EmptySchema())
    def delete(self, mid):
        model = _DuplicateDetectionWrapper(cache_dir=self._cache_dir, mid=mid)
        model.delete()
        return {}
class APIRecordResultSchema(JSONPyramidRequestSchema):
    """Schema for validating proxy requests to the LTI Outcomes API for recording grades."""

    lis_outcome_service_url = fields.Str(required=True)
    """URL provided by the LMS to submit grades or other results to."""

    lis_result_sourcedid = fields.Str(required=True)
    """
    Opaque identifier provided by the LMS to identify a submission.

    This typically encodes the assignment context and LMS user.
    """

    score = fields.Number(
        required=True, validate=marshmallow.validate.Range(min=0, max=1)
    )
    """
    Score to record for the submission, as a number between 0 and 1.
    """
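# A minimal sketch of how the Range validator on `score` behaves at load time.
# The payload values are illustrative, and JSONPyramidRequestSchema is assumed
# to behave like a plain marshmallow Schema for this example.
from marshmallow import ValidationError

schema = APIRecordResultSchema()
try:
    schema.load({
        "lis_outcome_service_url": "https://lms.example.com/outcomes",
        "lis_result_sourcedid": "abc123",
        "score": 1.5,  # out of range: must lie within [0, 1]
    })
except ValidationError as err:
    print(err.messages)  # contains a range error keyed under 'score'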
class SearchApi(Resource):

    @doc(description=dedent("""
        Perform a document search (if `parent_id` is a `dataset_id`) or a
        semantic search (if `parent_id` is an `lsi_id`).

        Parameters
        ----------
        - `parent_id` : the id of the previous processing step (either `dataset_id` or `lsi_id`)
        - `query` : the search query
        - `nn_metric` : The similarity returned by the nearest neighbors classifier in ['cosine', 'jaccard', 'cosine_norm', 'jaccard_norm'].
        - `min_score` : filter out results below a similarity threshold
        - `max_results` : return only the first `max_results` documents
        - `sort` : sort the results by score
        """))
    @use_args({"parent_id": wfields.Str(required=True),
               "query": wfields.Str(required=True),
               'nn_metric': wfields.Str(missing='jaccard_norm'),
               'min_score': wfields.Number(missing=-1),
               'sort': wfields.Boolean(missing=True),
               'max_results': wfields.Int()})
    @marshal_with(SearchResponseSchema())
    def post(self, **args):
        parent_id = args['parent_id']
        model = _SearchWrapper(cache_dir=self._cache_dir, parent_id=parent_id)

        query = args['query']
        scores = model.search(query, metric=args['nn_metric'])
        scores_pd = pd.DataFrame({'score': scores,
                                  'internal_id': np.arange(model.fe.n_samples_, dtype='int')})

        res = model.fe.db.render_dict(scores_pd)
        res = [row for row in res if row['score'] > args['min_score']]

        if 'max_results' in args:
            res = res[:args['max_results']]
        if args['sort']:
            res = sorted(res, key=lambda row: row['score'], reverse=True)

        return {'data': res}
class BirchClusteringApi(Resource):

    @doc(description=dedent("""
           Compute Birch clustering

           The option `use_hashing=False` must be set for the feature extraction.
           Recommended options for data ingestion also include
           `use_idf=1, sublinear_tf=0, binary=0`.

           **Parameters**
            - `parent_id`: `dataset_id` or `lsi_id`
            - `n_clusters`: the number of clusters
            - `min_similarity`: The radius of the subcluster obtained by merging a new sample and the closest subcluster should be less than the threshold. Otherwise a new subcluster is started. See [sklearn.cluster.Birch](http://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html)
            - `branching_factor`: Maximum number of CF subclusters in each node. If a new sample enters such that the number of subclusters exceeds the branching_factor, then the node has to be split. The corresponding parent also has to be split, and if the number of subclusters in the parent is greater than the branching factor, it has to be split recursively.
            - `nn_metric` : The similarity returned by the nearest neighbors classifier in ['cosine', 'jaccard', 'cosine_norm', 'jaccard_norm'].
          """))
    @use_args({'parent_id': wfields.Str(required=True),
               'n_clusters': wfields.Int(missing=150),
               'branching_factor': wfields.Int(missing=50),
               'min_similarity': wfields.Number(missing=0.75),  # corresponds approximately to threshold = 0.5
               'nn_metric': wfields.Str(missing='jaccard_norm')})
    @marshal_with(IDSchema())
    def post(self, **args):
        from math import sqrt

        S_cos = _scale_cosine_similarity(args.pop('min_similarity'),
                                         metric=args.pop('nn_metric'),
                                         inverse=True)
        # convert cosine similarity to euclidean distance
        threshold = sqrt(2 * (1 - S_cos))

        cl = _ClusteringWrapper(cache_dir=self._cache_dir,
                                parent_id=args.pop('parent_id'))
        cl.birch(threshold=threshold, **args)
        return {'id': cl.mid}
    @use_args(_k_mean_clustering_api_post_args)
    @marshal_with(IDSchema())
    def post(self, **args):
        cl = _ClusteringWrapper(cache_dir=self._cache_dir,
                                parent_id=args['parent_id'])
        del args['parent_id']
        labels = cl.k_means(**args)  # TODO: unused variable. Remove?
        return {'id': cl.mid}


_birch_clustering_api_post_args = {
    'parent_id': wfields.Str(required=True),
    'n_clusters': wfields.Int(default=150),
    'threshold': wfields.Number(),
}


class BirchClusteringApi(Resource):

    @use_args(_birch_clustering_api_post_args)
    @marshal_with(IDSchema())
    def post(self, **args):
        cl = _ClusteringWrapper(cache_dir=self._cache_dir,
                                parent_id=args['parent_id'])
        del args['parent_id']
        cl.birch(**args)
        return {'id': cl.mid}
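# Note on the `wfields.Int(default=150)` above: in marshmallow (and therefore
# in webargs fields), `missing=` is the value applied when loading absent
# input, while `default=` only applies when dumping, so a `default=` on a
# request argument is not filled in at parse time. A small illustration with
# a plain marshmallow 3.x schema (the schema and field names here are
# illustrative, not part of the API above):
from marshmallow import Schema, fields as ma_fields


class _ClustersArgsSketch(Schema):
    n_clusters_missing = ma_fields.Int(missing=150)
    n_clusters_default = ma_fields.Int(default=150)


loaded = _ClustersArgsSketch().load({})
print(loaded)  # {'n_clusters_missing': 150}; the `default=` field stays absent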
class SuccessListSchema(ma.Schema):
    results = fields.List(fields.Raw())
    total = fields.Number()
from webargs.flaskparser import use_args
from datetime import datetime

from models import Project, ProjectSchema
from controllers import ProjectController, AppSecurityController
from errors import ProjectControllerError, AppSecurityControllerError

auth = HTTPBasicAuth()
app_security_controller = AppSecurityController()
project_controller = ProjectController()
project_schema = ProjectSchema()
projects_schema = ProjectSchema(many=True)

# parameters from the URL
url_args = {
    'num_project_id': fields.Number(),
    'num_resource_id': fields.Number()
}


@auth.get_password
def get_password(oauth_user):
    try:
        resp = app_security_controller.get_password(oauth_user)
        log.debug('type res: {}'.format(type(resp)))
        log.debug('sending password ...')
        return resp
    except AppSecurityControllerError as e:
        log.error(e)
        return None
class AddPublisherSchema(ma.Schema):
    name = fields.Str(validate=validate.Length(min=1, max=100), required=True)
    comment = fields.Str(validate=validate.Length(min=1, max=1024), missing=None)
    airtime = fields.Number(validate=validate.Range(min=.1, max=100), missing=None)
class CalculateResponse(Schema):
    """Schema for serializing the response."""

    amount_with_discount = fields.Number()
    total_amount = fields.Number()
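# A minimal sketch of serializing a response payload with the schema above,
# assuming marshmallow 3.x; the amounts here are illustrative only.
payload = {"amount_with_discount": 90.0, "total_amount": 100.0}
serialized = CalculateResponse().dump(payload)
print(serialized)  # {'amount_with_discount': 90.0, 'total_amount': 100.0}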
        if args['lsi_components'] < 0:
            args['lsi_components'] = None
        cl = Clustering(cache_dir=self._cache_dir,
                        dsid=args['dataset_id'])
        del args['dataset_id']
        labels = cl.k_means(**args)  # TODO: unused variable. Remove?
        return {'id': cl.mid}


_birch_clustering_api_post_args = {
    'dataset_id': wfields.Str(required=True),
    'n_clusters': wfields.Int(required=True),
    'lsi_components': wfields.Int(missing=-1),
    'threshold': wfields.Number(),
}


class BirchClusteringApi(Resource):

    @use_args(_birch_clustering_api_post_args)
    @marshal_with(IDSchema())
    def post(self, **args):
        if args['lsi_components'] < 0:
            args['lsi_components'] = None
        cl = Clustering(cache_dir=self._cache_dir,
                        dsid=args['dataset_id'])
        del args['dataset_id']
        cl.birch(**args)
        return {'id': cl.mid}