def get(self, model_id):
    """Return a single trained model as a HATEOAS-style dict.

    Looks up the model row by primary key, serializes it, and attaches
    'self' and 'file' links (the file lives behind the Storage API).
    """
    trained_model = TrainedModel.query.filter_by(id=model_id).first()
    content_dict = row_to_dict(trained_model)
    self_link = {
        "rel": "self",
        "href": url_join(base_url=request.base_url, url=model_id),
    }
    file_link = {
        "rel": "file",
        "href": url_join(current_app.config['STORAGE_API'],
                         content_dict['file_id']),
    }
    return {'content': content_dict, 'links': [self_link, file_link]}
def dataset_get(dataset_id):
    """Render the detail page for one dataset.

    Fetches dataset metadata from the Datasets API, then the raw CSV
    binary from the Storage API, and renders a paginated table.
    """
    # TODO: A lot of it should be replaced -> df = h.dataset_id_to_df(dataset_id)
    dataset_single_uri = h.url_join(current_app.config['DATASETS_API'],
                                    dataset_id)
    r = requests.get(dataset_single_uri)
    if int(r.status_code) != 200:
        flash(
            'Dataset API response code was {}, '
            'cannot fetch the dataset'.format(r.status_code), 'warning')
        return redirect(url_for('site_datasets.dataset_list'))
    json_data = json.loads(r.content)
    storage_binary_uri = h.hateoas_get_link(json_data, 'binary')
    label_url = h.url_join(request.url, 'al')  # h.hateoas_get_link(json_data, 'label')
    # TODO: get storage delete uri from dataset
    delete_url = url_for('site_datasets.dataset_delete', dataset_id=dataset_id)
    r_storage = requests.get(storage_binary_uri)
    if r_storage.status_code != 200:
        # BUG FIX: report the Storage API's status code; the old code
        # formatted r.status_code (the dataset response, always 200 here).
        flash(
            'Storage API response code was {}, cannot fetch the file'.format(
                r_storage.status_code), 'warning')
        return redirect(url_for('site_datasets.dataset_list'))
    csv_content = r_storage.content
    try:
        df = pd.read_csv(io.StringIO(csv_content.decode('utf-8')))
    except EmptyDataError:
        flash('Could not make a dataset out of the storage file - it is empty',
              'warning')
        return redirect(url_for('site_datasets.dataset_list'))
    if df.empty:
        # NOTE: intentionally falls through and renders an empty table.
        flash('Cannot show dataset - it is empty', 'warning')
    html_params = h.get_html_pagination_params(request.args, df)
    return render_template(
        'datasets/info.html',
        info=json_data,
        data=html_params['page_data'],
        pagination=html_params['pagination'],
        download_url=storage_binary_uri,
        label_url=label_url,
        delete_url=delete_url,
    )
def get(self):
    """List datasets, optionally filtered by 'userid'/'projectid' args."""
    storage_api_url = current_app.config['STORAGE_API']
    user_id = request.args.get('userid')
    project_id = request.args.get('projectid')
    query = Dataset.query
    if user_id:
        query = query.filter_by(user_id=user_id)
    if project_id:
        query = query.filter_by(project_id=project_id)
    datasets = query.all()
    # TODO: add admin and None if not logged in - security risk!
    content = [{
        'id': d.id,
        'file_id': d.file_id,
        'name': d.name,
        'description': d.description,
        'train': d.train,
        'test': d.test,
        'label': d.label,
        'user_id': d.user_id,
        'project_id': d.project_id,
        'links': [{
            "rel": "self",
            "href": url_join(request.base_url, d.id)
        }, {
            "rel": "file",
            "href": url_join(storage_api_url, d.file_id)
        }]
    } for d in datasets]
    # BUG FIX: the self link used '?user_id=', but this endpoint reads
    # 'userid', so following the old link returned unfiltered results.
    # The project filter is now reflected too, and an unfiltered request
    # gets base_url instead of an empty href.
    filters = []
    if user_id:
        filters.append('userid={}'.format(user_id))
    if project_id:
        filters.append('projectid={}'.format(project_id))
    self_href = (request.base_url + '?' + '&'.join(filters)
                 if filters else request.base_url)
    return {
        'content': content,
        'links': [{
            "rel": "self",
            "href": self_href
        }]
    }
def get(self, dataset_id):
    """Return one dataset row plus HATEOAS links as a JSON response."""
    storage_api_url = current_app.config['STORAGE_API']
    # get sepcific model
    dataset = Dataset.query.filter_by(id=dataset_id).first()
    content = row_to_dict(dataset)
    links = [
        {"rel": "self", "href": url_join(request.base_url, dataset_id)},
        {"rel": "file", "href": url_join(storage_api_url, content['file_id'])},
    ]
    return jsonify({'content': content, 'links': links})
def post(self, model_id):
    """Predict with a stored model on a CSV posted in the request body.

    The posted body is parsed as CSV; only the first column is used as
    model input.  Returns a HATEOAS-like dict with X/y lists.
    """
    posted_file = request.get_data('content')
    csv = StringIO(posted_file.decode('utf-8'))
    df = pd.read_csv(csv, encoding='utf-8')
    tm = TrainedModel.query.filter_by(id=model_id).first()
    storage_api = current_app.config['STORAGE_API']
    r = requests.get('{}/{}'.format(storage_api, tm.file_id))
    pickled_model = r.content
    # SECURITY: pickle.loads on bytes fetched over HTTP executes arbitrary
    # code if the storage service is compromised - consider a safer format.
    trained_model = pickle.loads(pickled_model)
    # BUG FIX: DataFrame.ix was removed from pandas; use positional .iloc.
    x = df.iloc[:, 0]
    y = trained_model.predict(x)
    result = {'X': list(x.values), 'y': y.tolist()}
    self_href = url_join(
        base_url=request.base_url,
        url=url_for('machine_learning_models.single', model_id=model_id))
    return {
        'content': result,
        'links': [{
            "rel": "self",
            "href": self_href
        }]
    }
def get(self, dataset_id):
    """Prepare/refresh the active-learning batch for a dataset.

    Downloads the dataset CSV from storage, then either seeds an initial
    label set (no batch column yet) or marks the next batch of rows for
    labeling, writes the result back to storage, and returns the frame
    as JSON.
    """
    dataset_object = Dataset.query.filter_by(id=dataset_id).first()
    input_field = dataset_object.input
    target_field = dataset_object.target
    # label_batch_field = dataset_object.label_batch
    # TODO: Implement label batch field
    batch_field_name = 'batch'

    storage_response = requests.get(dataset_object.binary_uri)
    decoded_csv = storage_response.content.decode('utf-8')
    frame = pd.read_csv(StringIO(decoded_csv))

    sample_size = min(len(frame) // 10, 100)
    if batch_field_name not in frame.columns:
        # First pass: no batch column yet, build the initial label set.
        sample_size = min(sample_size, len(frame))
        result_df = get_initial_label_set(
            frame,
            input_field,
            n_clusters=sample_size,
            result_save_path=None,  # TODO: Needs to be temp and removed instantly
            batch_field=batch_field_name)
    else:
        unlabeled_rows = frame[(frame[target_field].apply(pd.isnull))]
        marked_unlabeled_rows = frame[
            (frame[batch_field_name] != 0)
            & ((frame[target_field] == '')
               | (frame[target_field].apply(pd.isnull)))]
        marked_labels_missing = len(marked_unlabeled_rows) != 0
        all_labeled = len(unlabeled_rows) == 0
        if marked_labels_missing or all_labeled:
            # Either the current batch is still being labeled, or there is
            # nothing left to label - keep the frame untouched.
            result_df = frame
        else:
            result_df = mark_for_labeling(frame, batch_field_name, sample_size)

    result_csv = result_df.to_csv(index=False,
                                  encoding='utf-8').encode('utf-8')
    requests.put(dataset_object.storage_adapter_uri,
                 data=result_csv,
                 verify=False
                 # headers={'Content-Type': 'application/octet-stream'}
                 )
    self_href = url_join(request.base_url, dataset_id)
    return {
        'content': result_df.to_json(),
        'links': [{
            "rel": "self",
            "href": self_href
        }]
    }
def get(self):
    """List trained models, optionally filtered by user and/or project."""
    user_id = request.args.get('userid')
    project_id = request.args.get('projectid')
    query = TrainedModel.query
    if user_id:
        query = query.filter_by(user_id=user_id)
    if project_id:
        query = query.filter_by(project_id=project_id)
    storage_api = current_app.config['STORAGE_API']
    model_dicts = []
    for row in query.all():
        model_dict = row_to_dict(row)
        model_dict['links'] = [
            {
                "rel": "self",
                "href": url_join(base_url=request.base_url,
                                 url=model_dict['id'])
            },
            {
                "rel": "file",
                "href": url_join(base_url=storage_api,
                                 url=model_dict['file_id'])
            },
        ]
        model_dicts.append(model_dict)
    return {
        'content': model_dicts,
        'links': [{
            "rel": "self",
            "href": request.base_url
        }]
    }
def get(self, dataset_id):
    """Return dataset metadata plus self/storage/label/binary links."""
    # storage_api_url = current_app.config['STORAGE_API']
    dataset = Dataset.query.filter_by(id=dataset_id).first()
    # Removed unused local `content = row_to_dict(dataset)` - the response
    # below is built field-by-field from the row itself.
    # TODO: get uri form DB, not id
    return {
        'id': dataset.id,
        'name': dataset.name,
        'description': dataset.description,
        'do_train': dataset.train,
        'do_test': dataset.test,
        'do_label': dataset.label,
        'user_id': dataset.user_id,
        'project_id': dataset.project_id,
        'links': [{
            "rel": "self",
            "href": url_join(request.base_url, dataset.id)
        }, {
            "rel": "storage",
            "href": dataset.storage_adapter_uri
        }, {
            "rel": "label",
            "href": url_join(request.url_root, 'api', 'v1', 'active-learning')
        }, {
            "rel": "binary",
            "href": get_binary_uri(dataset)
        }]
    }
def dataset_id_to_df(dataset_id):
    """Fetch a dataset's CSV binary via the Datasets API and load it
    into a pandas DataFrame."""
    datasets_api = current_app.config['DATASETS_API']
    dataset_response = requests.get(h.url_join(datasets_api, dataset_id))
    binary_uri = h.hateoas_get_link(dataset_response.json(), 'binary')
    binary_response = requests.get(binary_uri)
    # TODO: This is implementation for for text based problems only
    decoded = binary_response.content.decode('utf-8')
    return pd.read_csv(StringIO(decoded))
def get(self, model_id):
    """Return one trained model (specific-model lookup) with a self link."""
    trained_model = TrainedModel.query.filter_by(id=model_id).first()
    content = row_to_dict(trained_model)
    self_link = {
        "rel": "self",
        "href": h.url_join(request.base_url, model_id),
    }
    return {'content': content, 'links': [self_link]}
def dataset_delete(dataset_id):
    """Delete a dataset and its backing file, then return to the list.

    BUG FIX: the old code flashed 'Not yet implemented' even though both
    deletes were actually performed, and then redirected to the detail
    page of the dataset it had just deleted.
    """
    dataset_api_url = current_app.config['DATASETS_API']
    dataset_url = url_join(dataset_api_url, dataset_id)
    r = requests.get(dataset_url)
    json_data = json.loads(r.content)
    file_url = _get_link(json_data['links'], 'file')
    r_dataset = requests.delete(dataset_url)
    r_file = requests.delete(file_url)
    if (r_dataset.status_code in (200, 204)
            and r_file.status_code in (200, 204)):
        flash('Dataset deleted', 'info')
    else:
        flash(
            'Delete failed (dataset API: {}, file API: {})'.format(
                r_dataset.status_code, r_file.status_code), 'warning')
    # NOTE(review): assumes a 'dataset_list' endpoint exists on this
    # blueprint (the dataset_get view redirects there on errors) - confirm.
    return redirect(url_for('.dataset_list'))
def get_available_fields(dataset_id):
    """Return a JSON mapping of column name -> column name for a dataset.

    Resolves the dataset's 'binary' link via the Datasets API, loads the
    CSV, and exposes its columns (shape suits form-choice population).
    """
    dataset_api_url = current_app.config['DATASETS_API']
    dataset_url = h.url_join(dataset_api_url, dataset_id)
    response = requests.get(dataset_url)
    content = json.loads(response.content)
    # First 'binary' link wins; None if absent (matches old behavior).
    file_url = next(
        (link['href'] for link in content['links']
         if link['rel'] == 'binary'), None)
    df = h.url_csv_to_df(file_url)
    return jsonify({column: column for column in df.columns})
def dataset_delete(dataset_id):
    """Delete a dataset and its storage object, then return to the list.

    BUG FIX: the old code flashed 'Not yet implemented' even though both
    deletes were actually performed, and then redirected to the detail
    page of the dataset it had just deleted.
    """
    dataset_api_url = current_app.config['DATASETS_API']
    dataset_url = h.url_join(dataset_api_url, dataset_id)
    r = requests.get(dataset_url)
    json_data = json.loads(r.content)
    storage_url = h.hateoas_get_link(json_data, 'storage')
    # file_url = _get_link(json_data['links'], 'file')
    # TODO: get storage delete uri from dataset
    r_dataset = requests.delete(dataset_url)
    r_file = requests.delete(storage_url)
    if (r_dataset.status_code in (200, 204)
            and r_file.status_code in (200, 204)):
        flash('Dataset deleted', 'info')
    else:
        flash(
            'Delete failed (dataset API: {}, storage API: {})'.format(
                r_dataset.status_code, r_file.status_code), 'warning')
    # NOTE(review): assumes a 'dataset_list' endpoint exists on this
    # blueprint (the dataset_get view redirects there on errors) - confirm.
    return redirect(url_for('.dataset_list'))
def model(model_id):
    """Render a minimal HTML summary page for one model."""
    url = h.url_join(current_app.config['MODELS_API'], str(model_id))
    r = requests.get(url)
    json_data = json.loads(r.content)
    url = h.hateoas_get_link(json_data, 'file')
    # TODO: fix this
    # BUG FIX: '</br>' is not a valid HTML tag (should be '<br/>'), and
    # the href attribute must be quoted so URLs with special characters
    # don't break the markup.
    return '''
    <p>
    Id: {}<br/>
    Name: {}<br/>
    Description:<br/>
    {}<br/>
    <a href="{}">link</a></p>
    '''.format(json_data['content']['id'], json_data['content']['name'],
               json_data['content']['description'], url)
def model(model_id):
    """Render a minimal HTML summary page for one model."""
    url = url_join(current_app.config['MODELS_API'], str(model_id))
    r = requests.get(url)
    json_data = json.loads(r.content)
    url = _get_link(json_data['links'], 'file')
    # url = api_url + '/api/v1/storage/{}'.format(json_data['file_id'])
    # BUG FIX: '</br>' is not a valid HTML tag (should be '<br/>'), and
    # the href attribute must be quoted so URLs with special characters
    # don't break the markup.
    return '''
    <p>
    Id: {}<br/>
    Name: {}<br/>
    Description:<br/>
    {}<br/>
    <a href="{}">link</a></p>
    '''.format(json_data['content']['id'], json_data['content']['name'],
               json_data['content']['description'], url)
def post(self, model_id):
    """
    Predict using trained model with selected data and fields

    :param model_id: ID of the model used for prediction
    :return: HATEOAS-like dict, where content is the df as JSON
    """
    posted_json = request.get_json()
    df = dataset_id_to_df(posted_json['dataset_id'])
    x = df.get(posted_json['input_field'][0])
    trained_model = TrainedModel.query.filter_by(id=model_id).first()
    algorithm = algorithm_dict[trained_model.algorithm_id](
        storage_adapter_api=current_app.config['STORAGE_ADAPTER_API'])
    algorithm.resources = posted_json['resources']
    algorithm.load()
    result = algorithm.predict(x)
    for idx, class_col in enumerate(result.T):
        # BUG FIX: the old ternary bound to the whole concatenation
        # ('predicted' + suffix if multi-col else ''), so a single-column
        # prediction got the empty string '' as its column name.  Append
        # the column index only when several columns were predicted.
        predicted_column_name = 'predicted' + (
            '_{}'.format(idx) if result.shape[1] > 1 else '')
        # If the column name already exists, append a unique ID.
        if predicted_column_name in df.keys():
            predicted_column_name += str(uuid4())
        df[predicted_column_name] = class_col
    self_href = h.url_join(request.base_url, 'machine_learning_models.single',
                           model_id)
    return {
        'content': df.to_json(),
        'links': [{
            "rel": "self",
            "href": self_href
        }]
    }
def post_local(uri, bin_data):
    """POST binary data to *uri* and return the URI of the stored object
    (the service responds with the new storage id)."""
    response = requests.post(uri, bin_data)
    storage_id = response.json()
    return h.url_join(uri, storage_id)
def predict():
    """Form view: run a trained model on a chosen dataset column and
    return the prediction result as a downloadable Excel/CSV file.

    Flow: list the user's models and datasets, populate the form,
    POST the selected input column's CSV to the model's API URL,
    append the predicted column to the dataset, and stream the export.
    """
    models_api_url = current_app.config['MODELS_API']
    params = {'userid': current_user.id}
    r = requests.get(models_api_url, params=params)
    models_response = r.json()
    model_file_url_dict = dict()
    model_choices = []
    for model_dict in models_response['content']:
        model_id = str(model_dict['id'])
        choice = (model_id, model_dict['name'])
        model_choices.append(choice)
        model_file_url_dict[model_id] = [
            l['href'] for l in model_dict['links'] if l['rel'] == 'self'
        ][0]
    form = PredictForm()
    form.model.choices = model_choices
    dataset_choices = get_user_datasets_choices(current_user.id)
    form.dataset.choices = dataset_choices
    if not dataset_choices or not model_choices:
        if not dataset_choices:
            flash('Upload some datasets first', 'warning')
        if not model_choices:
            flash('No models - try training some first', 'warning')
        return redirect(url_for('site_machine_learning.root'))
    if form.is_submitted():
        selected_dataset_value = form.dataset.data
    else:
        selected_dataset_value = dataset_choices[0][0]
    fields_dict = get_available_fields(selected_dataset_value).json.items()
    form.input.choices = fields_dict
    if form.validate_on_submit():
        # get dataset
        datasets_api_url = current_app.config['DATASETS_API']
        r = requests.get(url_join(datasets_api_url, selected_dataset_value))
        dataset_info = r.json()
        dataset_file_url = [
            l['href'] for l in dataset_info['links'] if l['rel'] == 'file'
        ][0]
        df = url_csv_to_df(dataset_file_url)
        input_df = df[[form.input.data]]
        # run model
        model_file_url = model_file_url_dict[form.model.data]
        r = requests.post(
            model_file_url,
            data=input_df.to_csv(index=False,
                                 encoding='utf-8').encode('utf-8'),
            headers={'Content-type': 'text/plain; charset=utf-8'})
        if r.status_code not in (200, 201):
            flash(
                'Prediction failed - model API call '
                'returned code: {}'.format(r.status_code), 'warning')
            return redirect(url_for('site_machine_learning.predict'))
        result = r.json()['content']
        result_df = DataFrame({'X': result['X'], 'y': result['y']})
        predicted_column_name = 'predicted'
        if predicted_column_name in df.keys():
            # BUG FIX: uuid4() returns a UUID object; 'str += UUID'
            # raises TypeError, so convert to str before appending.
            predicted_column_name += str(uuid4())
        df[predicted_column_name] = result_df['y']
        export_type = form.type.data
        if export_type == 'excel':
            output = io.BytesIO()
            writer = pd.ExcelWriter(output, engine='xlsxwriter')
            df.to_excel(writer, sheet_name='Sheet1', encoding='utf-8')
            writer.save()
            result = output.getvalue()
            mimetype = "application/vnd.ms-excel"
            filename = 'result.xlsx'
        elif export_type == 'csv':
            result = df.to_csv(encoding='utf-8')
            mimetype = "text/csv"
            filename = 'result.csv'
        else:
            flash('Nonexistent export type', 'danger')
            return redirect(url_for('.predict'))
        return Response(result,
                        mimetype=mimetype,
                        headers={
                            "Content-disposition":
                            "attachment; filename={}".format(filename)
                        })
    return render_template('ml/predict_form.html',
                           form=form,
                           url=url_for('site_machine_learning.predict'))
def mark_for_labeling(dataset_id):
    """Ask the Datasets API to mark the next active-learning batch
    for *dataset_id* and return the raw response."""
    dataset_api = current_app.config['DATASETS_API']
    active_learning_uri = h.url_join(dataset_api, dataset_id, 'al')
    return requests.get(active_learning_uri)
def predict():
    """Form view: run a model through the Models API on a chosen dataset
    and stream back the prediction result as an Excel or CSV download."""
    models_api_url = current_app.config['MODELS_API']
    response = requests.get(models_api_url,
                            params={'userid': current_user.id})
    models_response = response.json()

    model_resources_dict = {}
    model_choices = []
    for model_dict in models_response['content']:
        model_id = str(model_dict['id'])
        model_choices.append((model_id, model_dict['name']))
        # TODO: rename to resources or so
        model_resources_dict[model_id] = model_dict['resources']

    form = PredictForm()
    form.model.choices = model_choices
    dataset_choices = get_user_datasets_choices(current_user.id)
    form.dataset.choices = dataset_choices

    if not dataset_choices or not model_choices:
        if not dataset_choices:
            flash('Upload some datasets first', 'warning')
        if not model_choices:
            flash('No models - try training some first', 'warning')
        return redirect(url_for('site_machine_learning.root'))

    selected_dataset_value = (form.dataset.data if form.is_submitted()
                              else dataset_choices[0][0])
    form.input.choices = get_available_fields(
        selected_dataset_value).json.items()

    if form.validate_on_submit():
        post_json = {
            'dataset_id': form.dataset.data,
            'input_field': form.input.data,
            'user_id': current_user.id,
            'project_id': '',
            'resources': model_resources_dict[form.model.data]
        }
        response = requests.post(h.url_join(models_api_url, form.model.data),
                                 json=post_json)
        if response.status_code not in (200, 201):
            flash(
                'Prediction failed - model API call '
                'returned code: {}'.format(response.status_code), 'warning')
            return redirect(url_for('site_machine_learning.predict'))

        result = response.json()['content']
        df = pd.read_json(result)
        export_type = form.type.data
        if export_type == 'excel':
            buffer = io.BytesIO()
            writer = pd.ExcelWriter(buffer, engine='xlsxwriter')
            df.to_excel(writer, sheet_name='Sheet1', encoding='utf-8')
            writer.save()
            result = buffer.getvalue()
            mimetype = "application/vnd.ms-excel"
            filename = 'result.xlsx'
        elif export_type == 'csv':
            result = df.to_csv(encoding='utf-8')
            mimetype = "text/csv"
            filename = 'result.csv'
        else:
            flash('Nonexistent export type', 'danger')
            return redirect(url_for('.predict'))
        return Response(result,
                        mimetype=mimetype,
                        headers={
                            "Content-disposition":
                            "attachment; filename={}".format(filename)
                        })

    return render_template('ml/predict_form.html',
                           form=form,
                           url=url_for('site_machine_learning.predict'))