def upload_dataset(file, name, description, train, test, label, input, target):
    datasets_api_url = current_app.config['DATASETS_API']
    storage_adapter_api_url = current_app.config['STORAGE_ADAPTER_API']

    # Store the raw file first, then register the dataset metadata.
    try:
        r = requests.post(
            storage_adapter_api_url,
            data=file,
            headers={'Content-Type': 'application/octet-stream'})
        r.raise_for_status()
    except requests.exceptions.RequestException:
        flash(
            'Could not store file - got an error from the Storage Adapter API',
            'warning')
        return False

    storage_adapter_json = r.json()
    dataset_json = {
        'name': name,
        'description': description,
        'train': train,  # TODO: maybe it can just accept true/false
        'test': test,
        'label': label,
        'input': input,
        'target': target,
        'storage_adapter_uri': hateoas_get_link(storage_adapter_json, 'self'),
        'binary_uri': hateoas_get_link(storage_adapter_json, 'binary'),
        'user_id': current_user.id,
        'project_id': 0,
    }

    # A plain requests.post never raises HTTPError on its own, so catch
    # connection-level errors here and check the status code explicitly below.
    try:
        r = requests.post(datasets_api_url, json=dataset_json)
    except requests.exceptions.RequestException:
        flash('Could not store file - got an error from the Dataset API',
              'warning')
        return False
    if not 200 <= r.status_code < 300:
        flash(
            'Could not store file - Dataset API status code: {}'.format(
                r.status_code), 'warning')
        return False

    flash('Dataset uploaded', 'success')
    return True
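# For reference: `hateoas_get_link` is defined elsewhere in the project. A
# minimal sketch of the behaviour the callers here rely on, assuming the
# services return a `links` list of {"rel": ..., "href": ...} objects (the
# exact payload shape is an assumption, not confirmed by this file):
def _hateoas_get_link_sketch(response_json, rel):
    for link in response_json.get('links', []):
        if link.get('rel') == rel:
            return link.get('href')
    return None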
def dataset_get(dataset_id):
    # TODO: much of this should be replaced with df = h.dataset_id_to_df(dataset_id)
    dataset_single_uri = h.url_join(current_app.config['DATASETS_API'],
                                    dataset_id)
    r = requests.get(dataset_single_uri)
    if r.status_code != 200:
        flash(
            'Dataset API response code was {}, '
            'cannot fetch the dataset'.format(r.status_code), 'warning')
        return redirect(url_for('site_datasets.dataset_list'))

    json_data = json.loads(r.content)
    storage_binary_uri = h.hateoas_get_link(json_data, 'binary')
    # TODO: use h.hateoas_get_link(json_data, 'label') once available
    label_url = h.url_join(request.url, 'al')
    # TODO: get the storage delete URI from the dataset
    delete_url = url_for('site_datasets.dataset_delete', dataset_id=dataset_id)

    r_storage = requests.get(storage_binary_uri)
    if r_storage.status_code != 200:
        flash(
            'Storage API response code was {}, cannot fetch the file'.format(
                r_storage.status_code), 'warning')
        return redirect(url_for('site_datasets.dataset_list'))

    csv_content = r_storage.content
    try:
        df = pd.read_csv(io.StringIO(csv_content.decode('utf-8')))
    except EmptyDataError:
        flash('Could not make a dataset out of the storage file - it is empty',
              'warning')
        return redirect(url_for('site_datasets.dataset_list'))
    if df.empty:
        flash('Cannot show dataset - it is empty', 'warning')

    html_params = h.get_html_pagination_params(request.args, df)
    return render_template(
        'datasets/info.html',
        info=json_data,
        data=html_params['page_data'],
        pagination=html_params['pagination'],
        download_url=storage_binary_uri,
        label_url=label_url,
        delete_url=delete_url,
    )
def dataset_id_to_df(dataset_id):
    datasets_api = current_app.config['DATASETS_API']
    dataset_uri = h.url_join(datasets_api, dataset_id)
    r = requests.get(dataset_uri)
    r_dataset = r.json()
    storage_binary_uri = h.hateoas_get_link(r_dataset, 'binary')
    r_binary = requests.get(storage_binary_uri)
    # TODO: this implementation covers text-based (CSV) problems only
    dataset = r_binary.content.decode('utf-8')
    csv = StringIO(dataset)
    return pd.read_csv(csv)
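# Example use from within a request/application context; the dataset id is
# illustrative only:
#
#     df = dataset_id_to_df('42')
#     print(df.head())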
def post(self):
    # Train a model with a submitted dataset and store it.
    storage_adapter_api = current_app.config['STORAGE_ADAPTER_API']
    datasets_api = current_app.config['DATASETS_API']

    posted_json = request.get_json()
    name = posted_json['name']
    description = posted_json['description']
    dataset_id = posted_json['dataset_id']
    algorithm_id = int(posted_json['algorithm_id'])
    user_id = posted_json['user_id']
    project_id = posted_json['project_id']

    dataset_uri = datasets_api + '/' + dataset_id
    r = requests.get(dataset_uri)
    r_dataset = r.json()
    storage_binary_uri = h.hateoas_get_link(r_dataset, 'binary')
    r_binary = requests.get(storage_binary_uri)
    # TODO: this implementation covers text-based (CSV) problems only
    dataset = r_binary.content.decode('utf-8')
    csv = StringIO(dataset)
    df = pd.read_csv(csv)

    algorithm = algorithm_dict[algorithm_id](storage_adapter_api)
    # DataFrame.ix is deprecated; use iloc for positional column access.
    algorithm.train(df.iloc[:, 0], df.iloc[:, 1])
    algorithm.persist()

    trained_model = TrainedModel(
        name=name,
        description=description,
        algorithm_id=algorithm_id,
        dataset_id=dataset_id,
        user_id=user_id,
        project_id=project_id,
    )
    for key in algorithm.resources:
        resource = ModelResource()
        resource.name = key
        resource.uri = algorithm.resources[key]
        trained_model.resources.append(resource)
    db.session.add(trained_model)
    db.session.commit()

    return redirect(
        url_for('machine_learning_models.single', model_id=trained_model.id))
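# The handler above assumes `algorithm_dict` maps an integer id to a class
# constructed with the storage-adapter URL and exposing train()/persist()
# plus a `resources` dict. A hypothetical sketch of that implied interface
# (the class itself is an assumption; only the attribute and method names
# come from the handler):
class _AlgorithmSketch:
    def __init__(self, storage_adapter_api):
        self.storage_adapter_api = storage_adapter_api
        self.resources = {}  # name -> storage adapter URI, filled by persist()

    def train(self, inputs, targets):
        raise NotImplementedError

    def persist(self):
        raise NotImplementedError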
def dataset_delete(dataset_id):
    dataset_api_url = current_app.config['DATASETS_API']
    dataset_url = h.url_join(dataset_api_url, dataset_id)
    r = requests.get(dataset_url)
    json_data = json.loads(r.content)
    storage_url = h.hateoas_get_link(json_data, 'storage')
    # TODO: get the storage delete URI from the dataset instead of the
    # generic 'storage' link
    r_dataset = requests.delete(dataset_url)
    r_file = requests.delete(storage_url)
    if r_dataset.status_code >= 400 or r_file.status_code >= 400:
        flash('Could not delete the dataset and/or its stored file', 'warning')
        return redirect(url_for('.dataset_get', dataset_id=dataset_id))
    flash('Dataset deleted', 'success')
    # The dataset is gone, so redirect to the list rather than its detail page.
    return redirect(url_for('site_datasets.dataset_list'))
def model(model_id):
    url = h.url_join(current_app.config['MODELS_API'], str(model_id))
    r = requests.get(url)
    json_data = json.loads(r.content)
    url = h.hateoas_get_link(json_data, 'file')
    # TODO: replace this inline HTML with a proper template
    return '''
    <p>
    Id: {}<br/>
    Name: {}<br/>
    Description:<br/>
    {}<br/>
    <a href="{}">link</a></p>
    '''.format(json_data['content']['id'], json_data['content']['name'],
               json_data['content']['description'], url)
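# One way to address the TODO above: move the markup into a template and let
# Jinja autoescape the values. A sketch using Flask's render_template_string
# (the template text is illustrative, not an existing project template):
#
#     from flask import render_template_string
#
#     _MODEL_TEMPLATE = '''
#     <p>
#       Id: {{ content.id }}<br/>
#       Name: {{ content.name }}<br/>
#       Description:<br/>
#       {{ content.description }}<br/>
#       <a href="{{ url }}">link</a>
#     </p>
#     '''
#     return render_template_string(
#         _MODEL_TEMPLATE, content=json_data['content'], url=url)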
def _persist_keras(self, model):
    # Save the model to a temporary h5 file on disk and read its bytes back.
    # (A truly in-memory approach via h5py's core driver is sketched in the
    # comment below, but it was not used.)
    fd, tmp_path = tempfile.mkstemp()
    os.close(fd)  # only the path is needed; close the open descriptor
    model.save(tmp_path)
    # with h5py.File(
    #         'does not matter',
    #         driver='core',
    #         backing_store=False) as h5file:
    #     save_model(model, h5file)
    #     h5file.flush()  # Very important! Otherwise you get all zeroes below.
    #     binary_data = h5file.fid.get_file_image()
    with open(tmp_path, 'rb') as f:
        binary_data = f.read()
    os.remove(tmp_path)  # clean up the temp file
    r = requests.post(self.storage_adapter_api, data=binary_data)
    storage_adapter_uri = h.hateoas_get_link(r.json(), 'self')
    self.resources['keras'] = storage_adapter_uri
    return storage_adapter_uri
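# A possible counterpart for restoring the persisted Keras model, assuming
# `_get_binary` (defined at the bottom of this file) and keras.models.load_model,
# which expects a file path; this is a sketch, not existing project code:
def _load_keras_sketch(self):
    import os
    import tempfile
    from keras.models import load_model

    binary_data = _get_binary(self.resources['keras'])
    fd, tmp_path = tempfile.mkstemp(suffix='.h5')
    os.close(fd)
    try:
        with open(tmp_path, 'wb') as f:
            f.write(binary_data)
        return load_model(tmp_path)
    finally:
        os.remove(tmp_path)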
def al(dataset_id):
    form_update = UpdateForm()
    form_generate = DownloadForm()

    if form_update.validate_on_submit():
        r_dataset = h.get_dataset_response(dataset_id)
        dataset_json = r_dataset.json()
        storage_adapter_uri = h.hateoas_get_link(dataset_json, 'storage')
        f = form_update.file.data
        filename = secure_filename(f.filename)
        file_type = h.get_file_type(filename)
        if file_type == 'excel':
            df = pd.read_excel(f, encoding='utf-8')
        elif file_type == 'csv':
            df = pd.read_csv(f, encoding='utf-8')
        else:
            raise ValueError(
                'Expected an Excel or CSV file, received {}'.format(file_type))
        csv_data = df.to_csv(index=False, encoding='utf-8').encode('utf-8')
        requests.put(storage_adapter_uri, data=csv_data)
        flash(
            "Updated dataset's file '{}' (id: {}) with '{}'".format(
                dataset_json['name'], dataset_json['id'], filename), 'success')
        return redirect(url_for('.al', dataset_id=dataset_id))

    if form_generate.validate_on_submit():
        mark_for_labeling(dataset_id)
        df = h.dataset_id_to_df(dataset_id)
        if form_generate.csv_download.data:
            result = df.to_csv(encoding='utf-8')
            mimetype = 'text/csv'
            filename = 'result.csv'
        elif form_generate.xls_download.data:
            output = io.BytesIO()
            writer = pd.ExcelWriter(output, engine='xlsxwriter')
            df.to_excel(writer, sheet_name='Sheet1', encoding='utf-8')
            writer.save()
            result = output.getvalue()
            mimetype = 'application/vnd.ms-excel'
            filename = 'result.xlsx'
        else:
            flash(
                'Expected one of the buttons to be pushed, '
                'but that did not happen', 'warning')
            raise ValueError('Form response was neither csv nor xls')
        return Response(
            result,
            mimetype=mimetype,
            headers={
                'Content-Disposition':
                'attachment; filename={}'.format(filename)
            })

    return render_template(
        'al/index.html', form_generate=form_generate, form_update=form_update)
def _persist_sklearn(self, model):
    model_pickle = pickle.dumps(model)
    r = requests.post(self.storage_adapter_api, data=model_pickle)
    storage_adapter_uri = h.hateoas_get_link(r.json(), 'self')
    self.resources['sklearn'] = storage_adapter_uri
    return storage_adapter_uri  # TODO: not sure if useful
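# A matching loader for the pickled sklearn model, mirroring _persist_sklearn
# and assuming the module-level `_get_binary` below; sketch only:
def _load_sklearn_sketch(self):
    return pickle.loads(_get_binary(self.resources['sklearn']))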
def _get_binary(uri):
    # TODO: could use another approach if multiple storage backends are used
    r_json = requests.get(uri).json()
    bin_uri = h.hateoas_get_link(r_json, 'binary')
    return requests.get(bin_uri).content
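# Example use, with an illustrative URI:
#
#     content = _get_binary('http://storage-adapter/resources/123')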