def ml_run(dsid):
    """
    Dataset should be annotated at Lexonomy so we can download it and start ML process.
    ML statuses: Starting_ML -> ML_Format -> ML_Annotated -> Lex_Format
    Error statuses: Lex2ML_Error, ML_Error, ML2Lex_Error
    """
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    # get annotations first, so we get lex_xml path in db
    dataset = Datasets.list_datasets(uid, dsid=dsid)
    # ML can only start once the annotation step has completed at Lexonomy.
    if dataset.status['annotate'] != 'Ready':
        raise InvalidUsage('File is not annotated at Lexonomy.', status_code=409, enum='STATUS_ERROR')
    # Download the annotated XML from Lexonomy, then re-read the dataset row
    # so it reflects the paths get_lex_xml() just stored.
    get_lex_xml(uid, dsid)
    dataset = Datasets.list_datasets(uid, dsid=dsid)
    # deleting preview
    dataset.status['preview'] = None
    Datasets.dataset_add_ml_lexonomy_access(dsid)
    if dataset.lexonomy_ml_delete is not None:
        requests.post(dataset.lexonomy_ml_delete, headers={
            "Content-Type": 'application/json',
            "Authorization": app.config['LEXONOMY_AUTH_KEY']
        })
    # NOTE(review): this "already running" guard fires only after the preview
    # above has already been cleared/deleted — confirm that ordering is
    # intentional (moving the guard earlier would avoid destroying the
    # preview of an in-flight run).
    if dataset.status['ml'] in ['Starting_ML', 'ML_Format', 'ML_Annotated']:
        raise InvalidUsage('ML is already running.', status_code=409, enum='STATUS_ERROR')
    print_log(app.name, '{} Starting ML'.format(dataset))
    dataset.status['ml'] = 'Starting_ML'
    Datasets.dataset_status(dsid, set=True, status=dataset.status)
    # Get files ready
    xml_raw = dataset.xml_file_path
    # ML output path is derived from the Lexonomy XML path (strip ".xml").
    xml_ml_out = dataset.xml_lex[:-4] + '-ML_OUT.xml'
    Datasets.dataset_add_ml_paths(dsid, xml_lex=dataset.xml_lex, xml_ml_out=xml_ml_out)
    # Run ml
    task = run_pdf2lex_ml_scripts.apply_async(
        args=[uid, dsid, xml_raw, dataset.xml_lex, xml_ml_out],
        countdown=0)
    # Remember the Celery task id so status endpoints can report progress.
    Datasets.dataset_ml_task_id(dsid, set=True, task_id=task.id)
    return flask.make_response(
        {
            'message': 'ok',
            'dsid': dsid,
            'status': dataset.status['ml']
        }, 200)
def delete_ml(dsid):
    """Delete ML artifacts for a dataset.

    With ``?local=True`` the locally stored ML input/output files are removed
    (best-effort) and the stored ML paths are reset; otherwise Lexonomy is
    asked to delete the preview entry and the stored ML access info is reset.

    Returns a 200 JSON response ``{'message': 'OK'}``.
    """
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    dataset = Datasets.list_datasets(uid, dsid=dsid)
    local = flask.request.args.get('local', default=None, type=str) == 'True'

    if local:
        print_log(app.name, 'Deleting local ML files: {}'.format(dataset))
        json_ml_in = '/var/www/elexifier-api/app/media/ML-IN-{}.json'.format(str(dsid))
        json_ml_out = '/var/www/elexifier-api/app/media/ML-OUT-{}.json'.format(str(dsid))
        # Best-effort cleanup: any of these files may be missing (e.g. ML
        # never ran). The original swallowed everything with a bare except;
        # restrict to per-file removal errors so real bugs still surface.
        for path in (dataset.xml_lex, dataset.xml_ml_out, json_ml_in, json_ml_out):
            if not path:  # skip empty/None stored paths
                continue
            try:
                os.remove(path)
            except OSError:
                pass
        Datasets.dataset_add_ml_paths(dsid)  # reset stored ML paths
    else:
        print_log(app.name, 'Deleting Lexonomy preview file: {}'.format(dataset))
        if dataset.lexonomy_ml_delete is not None:
            requests.post(dataset.lexonomy_ml_delete, headers={
                "Content-Type": 'application/json',
                "Authorization": app.config['LEXONOMY_AUTH_KEY']
            })
        # BUG FIX: this was called with an extra leading `db` argument,
        # unlike every other call site in this module (cf. ml_run), which
        # would raise a TypeError at runtime.
        Datasets.dataset_add_ml_lexonomy_access(dsid)
    return flask.make_response({'message': 'OK'}, 200)
def lexonomy_download(uid, dsid):
    """Serve dataset XML to Lexonomy (protected by the shared secret).

    Three variants, selected by query flags:
      * ``ml=True``        — a 100-entry preview split of the ML output,
      * ``add_pages=True`` — the next 20 pages for further annotation,
      * otherwise          — the first 20 pages of the raw XML.

    The temporary file created for the response is removed afterwards.
    """
    if flask.request.headers.get('Authorization') != app.config['LEXONOMY_AUTH_KEY']:
        raise InvalidUsage("Shared secret is not valid!", status_code=401, enum='UNAUTHORIZED')

    ml = flask.request.args.get('ml', default="False", type=str) == "True"
    additional_pages = flask.request.args.get('add_pages', default="False", type=str) == "True"
    dataset = Datasets.list_datasets(uid, dsid=dsid)

    # Mark the relevant pipeline stage as in progress.
    stage = 'preview' if ml else 'annotate'
    dataset.status[stage] = 'Processing'
    Datasets.dataset_status(dsid, set=True, status=dataset.status)

    temp_fname = dataset.xml_file_path.split(".xml")[0] + "-tmp.xml"

    @after_this_request
    def remove_file(response):
        # The temp file only exists to back this single response.
        os.remove(temp_fname)
        return response

    if ml:
        # Send ml file
        split_preview(dataset.xml_ml_out, temp_fname, 100)
        download_name = dataset.xml_ml_out.split('/')[-1]
    elif additional_pages:
        # Send additional 20 pages file
        additional_n_pages(dataset.xml_file_path, dataset.xml_lex, temp_fname, 20)
        download_name = dataset.xml_file_path.split('/')[-1]
    else:
        # Send first 20 pages file
        first_n_pages(dataset.xml_file_path, temp_fname, 20)
        download_name = dataset.xml_file_path.split('/')[-1]
    return flask.send_file(temp_fname, attachment_filename=download_name, as_attachment=True)
def repair_status():
    """
    implement a method, that repairs all dataset statuses.
    status should be json:
    {'annotate': [None, 'Starting', 'Processing', 'Lexonomy_Error', 'Ready'],
     'ml': [None, 'Starting_ML', 'Lex2ML_Error', 'ML_Format', 'ML_Error',
            'ML_Annotated', 'ML2Lex_Error', 'Lex_Format'],
     'preview': [None, 'Starting', 'Processing', 'Lexonomy_Error', 'Ready'],
     'download': [None, 'Preparing_download', 'Ready']}
    delete method after, leave status description
    """
    # One-off maintenance endpoint: brute-force all dataset ids and rebuild
    # the status dict from the stored Lexonomy access fields.
    for dsid in range(0, 1000):
        try:
            dataset = Datasets.list_datasets(None, dsid=dsid)
            status = {
                'preview': None if dataset.lexonomy_ml_access is None else 'Ready',
                'ml': None if dataset.lexonomy_ml_access is None else 'Lex_Format',
                'annotate': None if dataset.lexonomy_access is None else 'Ready',
                'download': None
            }
            Datasets.dataset_status(dsid, set=True, status=status)
        # FIX: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt. Missing ids are expected; skip them.
        except Exception:
            continue
    return flask.make_response({'msg': 'ok'}, 200)
def ml_download(dsid):
    """Download the ML output converted to TEI.

    Drives an asynchronous prepare/serve cycle via status['download']:
    first call kicks off `prepare_TEI_download`; subsequent calls report
    progress until the status is 'Ready', then the file is served and the
    status reset. Raises InvalidUsage (409) when ML output is absent or
    ML has not finished.
    """
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    dataset = Datasets.list_datasets(uid, dsid=dsid)

    # TODO: This checks can be replaced: if preview exists (is Ready), then get it from Lexonomy and download it
    # TODO: otherwise notify user to send ml output to preview
    # check if ml output is ready for download
    # BUG FIX: `xml_ml_out is ''` compared object identity against a string
    # literal, which is interning-dependent and unreliable; use equality.
    if dataset.xml_ml_out is None or dataset.xml_ml_out == '':
        raise InvalidUsage('No file for download. Try running ML first.',
                           status_code=409, enum='STATUS_ERROR')
    elif dataset.status['ml'] in [
            None, 'Starting_ML', 'Lex2ML_Error', 'ML_Format', 'ML_Error',
            'ML_Annotated', 'ML2Lex_Error'
    ]:
        raise InvalidUsage(
            'File is not ready for download. Wait for ML to finish first.',
            status_code=409, enum='STATUS_ERROR')

    tmp_file = dataset.xml_ml_out.split(".xml")[0] + "_TEI.xml"

    # stop if already preparing download
    if dataset.status['download'] == 'Preparing_download':
        return flask.make_response(
            {
                'msg': 'Dataset is preparing for download',
                'status': dataset.status
            }, 200)
    # if download is ready, return file
    elif dataset.status['download'] == 'Ready':
        dataset.status['download'] = None
        Datasets.dataset_status(dsid, set=True, status=dataset.status)

        @after_this_request
        def after(response):
            # Expose the suggested filename to the browser and remove the
            # temporary TEI file once the response has been sent.
            response.headers['x-suggested-filename'] = filename
            response.headers.add('Access-Control-Expose-Headers', '*')
            os.remove(tmp_file)
            return response

        filename = dataset.name.split('.')[0] + '-transformed.xml'
        return flask.send_file(tmp_file, attachment_filename=filename,
                               as_attachment=True, conditional=True)

    # prepare download
    dataset.status['download'] = 'Preparing_download'
    Datasets.dataset_status(dsid, set=True, status=dataset.status)
    character_map = Datasets.dataset_character_map(dsid)
    prepare_TEI_download.apply_async(
        args=[dsid, dataset.xml_ml_out, tmp_file, character_map])
    return flask.make_response(
        {
            'msg': 'Dataset is preparing for download',
            'status': dataset.status['download']
        }, 200)
def xf_list_all_transforms():
    """Return every transformation the user owns, across all their datasets.

    Each transform's name is prefixed with its dataset name so entries are
    distinguishable in a flat list.
    """
    uid = verify_user(flask.request.headers.get('Authorization'))
    result = []
    for ds in Datasets.list_datasets(uid):
        for transform in controllers.list_transforms(ds.id):
            transform.name = ds.name + '/' + transform.name
            result.append(Transformer.to_dict(transform))
    return flask.make_response(flask.jsonify(result), 200)
def delete_lexonomy(dsid):
    """Ask Lexonomy to delete this dataset's annotation entry and reset
    the stored Lexonomy access info."""
    uid = verify_user(flask.request.headers.get('Authorization'))
    dataset = Datasets.list_datasets(uid, dsid=dsid)

    delete_url = dataset.lexonomy_delete
    if delete_url is not None:
        lexonomy_headers = {
            "Content-Type": 'application/json',
            "Authorization": app.config['LEXONOMY_AUTH_KEY'],
        }
        requests.post(delete_url, headers=lexonomy_headers)

    Datasets.dataset_add_lexonomy_access(dsid)
    return flask.make_response({'message': 'OK'}, 200)
def prepare_download(uid, xfid, dsid, strip_ns, strip_header, strip_DictScrap):
    """Worker task: run a transformer over a dataset and write the TEI output.

    Reads the dataset file, applies the stored transform via DictTransformator,
    writes ``<name>_<xfid>_TEI.<ext>`` into APP_MEDIA and marks the transformer
    download status 'Ready'. On any failure the status is reset so the client
    can retry; the traceback is printed.
    """
    try:
        transformer = controllers.list_transforms(dsid, xfid=xfid)
        dataset = Datasets.list_datasets(uid, dsid=dsid)
        metadata = Datasets.dataset_metadata(dsid)
        xf = transformer.transform
        ds_path = dataset.file_path
        file_name = dataset.name
        header_Title = metadata['title']
        header_Bibl = metadata['bibliographicCitation']
        header_Publisher = metadata['publisher']

        # FIX: read via a context manager (the original leaked the handle).
        with open(ds_path, 'rb') as src:
            orig_xml = src.read()

        # Parse with a custom element class so the transformer can work on
        # its own element type.
        parserLookup = lxml.etree.ElementDefaultClassLookup(
            element=DictTransformator.TMyElement)
        myParser = lxml.etree.XMLParser()
        myParser.set_element_class_lookup(parserLookup)
        entity_xml = lxml.etree.fromstring(orig_xml, parser=myParser)

        mapping = DictTransformator.TMapping(xf)
        mapper = DictTransformator.TMapper()
        out_TEI, out_aug = mapper.Transform(
            mapping, [], [lxml.etree.ElementTree(entity_xml)],
            makeAugmentedInputTrees=True,
            stripForValidation=strip_ns,
            stripHeader=strip_header,
            stripDictScrap=strip_DictScrap,
            promoteNestedEntries=True,
            headerTitle=header_Title,
            headerPublisher=header_Publisher,
            headerBibl=header_Bibl,
            metadata=metadata)
        target_xml = lxml.etree.tostring(out_TEI, pretty_print=True,
                                         encoding='unicode')

        # BUG FIX: split('.') raised ValueError for names containing more
        # than one dot; rsplit keeps everything before the final extension.
        orig_fname, file_type = file_name.rsplit('.', 1)
        target_fname = orig_fname + '_' + str(xfid) + '_TEI.' + file_type
        target_path = os.path.join(app.config['APP_MEDIA'], target_fname)
        # 'w' creates the file itself; the old touch-then-write was redundant.
        with open(target_path, 'w') as out:
            out.write(target_xml)

        controllers.transformer_download_status(xfid, set=True,
                                                download_status='Ready')
    except Exception:
        print(traceback.format_exc())
        controllers.transformer_download_status(xfid, set=True)  # reset status
    return
def get_lex_xml(uid, dsid):
    """Fetch the annotated XML for a dataset from Lexonomy.

    Stores it at ``<xml_file_path minus .xml>-LEX.xml`` and records that path
    in the DB (preserving the existing xml_ml_out path).
    """
    dataset = Datasets.list_datasets(uid, dsid=dsid)
    xml_lex = dataset.xml_file_path[:-4] + "-LEX.xml"
    Datasets.dataset_add_ml_paths(dsid, xml_lex=xml_lex,
                                  xml_ml_out=dataset.xml_ml_out)
    request_headers = {
        "Authorization": app.config['LEXONOMY_AUTH_KEY'],
        "Content-Type": 'application/json'
    }
    response = requests.get(dataset.lexonomy_access, headers=request_headers)
    #data = re.search("<BODY.*<\/BODY>", response.text).group()
    # FIX: write via a context manager so the handle is closed even if the
    # write fails (original used bare open/close).
    with open(xml_lex, "w") as f:
        f.write(response.text)
    return
def ml_preview(dsid):
    """Send the dataset's ML output to Lexonomy as a preview entry.

    Raises InvalidUsage (409) when there is no ML output file to preview.
    """
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    dataset = Datasets.list_datasets(uid, dsid=dsid)
    # BUG FIX: `xml_ml_out is ''` compared identity against a string literal
    # (interning-dependent); use equality. Parentheses make the original
    # `and`/`or` precedence explicit.
    # NOTE(review): the grouping means an empty-string xml_ml_out raises
    # regardless of ml status — confirm the intent wasn't
    # `status == 'Lex_Format' and (out is None or out == '')`.
    if (dataset.status['ml'] == 'Lex_Format' and dataset.xml_ml_out is None) \
            or dataset.xml_ml_out == '':
        raise InvalidUsage('No file for preview. Try running ML first.',
                           status_code=409, enum='STATUS_ERROR')
    ds_sendML_to_lexonomy(uid, dsid)
    return flask.make_response(
        {
            'message': 'ok',
            'dsid': dsid,
            'status': dataset.status
        }, 200)
def ds_list_datasets():
    """List the requesting user's datasets.

    Query params: ``order`` (upper-cased, defaults to "ASC") and ``mimetype``
    (defaults to "text/xml").
    """
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)

    mimetype = flask.request.args.get('mimetype')
    if not isinstance(mimetype, str):
        mimetype = "text/xml"

    order = flask.request.args.get('order')
    order = order.upper() if isinstance(order, str) else "ASC"

    rows = controllers.list_datasets(uid, order=order, mimetype=mimetype)
    payload = [Datasets.to_dict(row) for row in rows]
    return flask.make_response(jsonify(payload), 200)
def ds_sendML_to_lexonomy(uid, dsid):
    """Send a dataset's ML output to Lexonomy as a preview entry.

    Deletes any stale preview at Lexonomy first, then queues an asynchronous
    `make_lexonomy_request` task and marks status['preview'] = 'Starting'.
    """
    user = User.query.filter_by(id=uid).first()
    dataset = Datasets.list_datasets(uid, dsid=dsid)

    # Remove any previous preview entry at Lexonomy before sending a new one.
    if dataset.lexonomy_ml_delete is not None:
        requests.post(dataset.lexonomy_ml_delete, headers={
            "Content-Type": 'application/json',
            "Authorization": app.config['LEXONOMY_AUTH_KEY']
        })

    request_data = {
        'xml_file': '/api/lexonomy/' + str(uid) + '/download/' + str(dsid) + "?ml=True",
        'email': user.email,
        'filename': dataset.name + ' - preview',
        'type': 'preview',
        'url': app.config['URL'],
        'return_to': ""  # remove if no longer required
    }
    if user.sketch_engine_uid is not None:  # ske user
        request_data['ske_user'] = True
    else:
        request_data['ske_user'] = False

    # CONSISTENCY FIX: use print_log like every other function in this
    # module instead of a bare print().
    print_log(app.name, 'Starting asynchronous request to Lexonomy')
    make_lexonomy_request.apply_async(args=[dsid, request_data],
                                      kwargs={"ml": True}, countdown=0)

    # Update dataset status
    status = Datasets.dataset_status(dsid)
    status['preview'] = 'Starting'
    Datasets.dataset_status(dsid, set=True, status=status)
    msg = 'OK'
    return flask.make_response(
        {
            'message': msg,
            'dsid': dsid,
            'status': status['preview'],
            'test_request': request_data
        }, 200)
def ds_download2(xfid, dsid):
    """Download a transformed dataset, driving an async prepare/serve cycle.

    Behavior depends on the transformer's stored download status:
      * ``?status=true``   — just report the current status,
      * status None        — queue `prepare_download` and set 'Processing',
      * 'Processing'       — report that the file is still being prepared,
      * 'Ready'            — serve the TEI file, reset the status, and delete
                             the file after the response is sent.
    """
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    status = controllers.transformer_download_status(xfid)
    get_status = flask.request.args.get('status', default='false', type=str) == 'true'

    if get_status:
        return flask.make_response({'status': status}, 200)
    elif status is None:
        print_log(
            app.name,
            'Transformed dataset download started uid: {0:s}, xfid: {1:s} , dsid: {2:s}'
            .format(str(uid), str(xfid), str(dsid)))
        strip_ns = flask.request.args.get('strip_ns', default='false', type=str) == 'true'
        strip_header = flask.request.args.get('strip_header', default='false', type=str) == 'true'
        strip_DictScrap = flask.request.args.get('strip_DictScrap', default='false', type=str) == 'true'
        strip_DictScrap = strip_ns  # TODO: remove this, when added to FE

        # Check if transformer exists
        try:
            transform = controllers.list_transforms(dsid, xfid=xfid)
            xf = transform.transform
        # FIX: narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not swallowed.
        except Exception:
            raise InvalidUsage('Transformer does not exist.', status_code=409)

        if xf is None:  # Not sure why this is needed here?
            return flask.make_response(
                {
                    'spec': None,
                    'entity_xml': None,
                    'output': None
                }, 200)
        else:
            # start download task
            prepare_download.apply_async(args=[
                uid, xfid, dsid, strip_ns, strip_header, strip_DictScrap
            ], countdown=0)
            status = 'Processing'
            controllers.transformer_download_status(xfid, set=True,
                                                    download_status=status)
    elif status == "Processing":
        return flask.make_response({'message': 'File is still processing'}, 200)
    elif status == "Ready":
        print_log(
            app.name,
            'Transformed dataset download finished uid: {0:s}, xfid: {1:s} , dsid: {2:s}'
            .format(str(uid), str(xfid), str(dsid)))
        # return file and delete afterwards
        dataset = Datasets.list_datasets(uid, dsid=dsid)
        # BUG FIX: split('.') raised ValueError when the dataset name contains
        # more than one dot; rsplit matches the naming in prepare_download.
        file_name, file_type = dataset.name.rsplit('.', 1)
        target_file_name = file_name + '_' + str(xfid) + '_TEI.' + file_type
        target_path = os.path.join(app.config['APP_MEDIA'], target_file_name)

        @after_this_request
        def remove_file(response):
            response.headers['x-suggested-filename'] = out_name
            response.headers.add('Access-Control-Expose-Headers', '*')
            os.remove(target_path)
            return response

        controllers.transformer_download_status(xfid, set=True)  # reset status
        transform_name = controllers.list_transforms(dsid, xfid=xfid).name
        out_name = dataset.name[:-4] + '-' + transform_name + '.xml'
        return flask.send_file(target_path, attachment_filename=out_name,
                               as_attachment=True)
    return flask.make_response({'message': 'ok', 'status': status}, 200)
def ds_upload_new_dataset():
    """Chunked dataset upload endpoint (Dropzone.js-style form fields).

    Appends each chunk to a per-user temp file; when the final chunk arrives
    and the size matches, the file is renamed to a generated name, a dataset
    row is created, and post-processing (pdf2xml or XML cleanup/tag mapping)
    is queued.
    """
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    # file
    metadata = flask.request.form.get('metadata', None)
    dictname = flask.request.files.get('dictname', None)
    file_content = flask.request.files.get('file', None)
    total_filesize = flask.request.form.get('dztotalfilesize', None)
    dzuuid = flask.request.form.get('dzuuid', None)
    current_chunk = int(flask.request.form.get('dzchunkindex'))
    total_chunks = int(flask.request.form.get('dztotalchunkcount', None))
    chunk_offset = int(flask.request.form.get('dzchunkbyteoffset', None))
    # get file extension
    try:
        orig_filename = file_content.filename
        extension = '.' + file_content.filename.split('.')[-1]
    except AttributeError:
        # No file part in the request: fall back to defaults.
        orig_filename = 'Dictionary'
        extension = '.xml'
    filename = "tempFile_USER-{0:s}".format(str(uid)) + extension
    filepath = os.path.join(app.config['APP_MEDIA'], secure_filename(filename))
    # A leftover temp file at chunk 0 means a previous upload never finished.
    if os.path.exists(filepath) and current_chunk == 0:
        os.remove(filepath)
        raise InvalidUsage('File already exists.', status_code=400, enum='FILE_EXISTS')
    try:
        # write to file
        # NOTE(review): in 'ab' (append) mode POSIX ignores seek() for
        # writes — this relies on chunk_offset always equalling the current
        # EOF, i.e. chunks arriving strictly in order. Verify out-of-order
        # chunks are impossible with the FE config.
        with open(filepath, 'ab') as f:
            f.seek(chunk_offset)
            f.write(file_content.stream.read())
    except OSError:
        raise InvalidUsage(
            "Not sure why, but we couldn't write the file to disk.",
            status_code=500, enum="FILE_ERROR")
    # NOTE(review): stock Dropzone sends a 0-based dzchunkindex, so
    # `current_chunk != total_chunks` would never detect the last chunk —
    # presumably the FE sends a 1-based index here; confirm.
    if current_chunk != total_chunks:
        return flask.make_response(
            jsonify({
                'status': 'OK',
                'filename': filename,
                'current_chunk': current_chunk,
                'total_chunks': total_chunks
            }), 200)
    else:
        # finish upload
        if os.path.getsize(filepath) != int(total_filesize):
            os.remove(filepath)
            raise InvalidUsage("Size mismatch.", status_code=500, enum="FILE_ERROR")
        else:
            # Move the temp file to its permanent, randomized name.
            new_random_name = generate_filename(filename)
            new_path = os.path.join(app.config['APP_MEDIA'],
                                    secure_filename(new_random_name))
            os.rename(filepath, new_path)
            dsid = controllers.add_dataset(db, uid, total_filesize,
                                           orig_filename, new_path, dzuuid)
            controllers.dataset_metadata(dsid, set=True, metadata=metadata)
            # prepare dataset
            dataset = controllers.list_datasets(uid, dsid)
            if "pdf" in dataset.upload_mimetype:
                # PDFs go through the async pdf2xml conversion.
                controllers.transform_pdf2xml.apply_async(args=[dsid])
            else:
                # XML uploads: strip empty namespaces, then map tags async.
                controllers.clean_empty_namespace(dsid)
                controllers.map_xml_tags.apply_async(args=[dsid])
            return flask.make_response(Datasets.to_dict(dataset), 200)
def ds_dataset_info(dsid):
    """Return the serialized info for a single dataset owned by the user."""
    uid = verify_user(flask.request.headers.get('Authorization'))
    ds = controllers.list_datasets(uid, dsid=dsid)
    payload = Datasets.to_dict(ds)
    return flask.make_response(jsonify(payload), 200)
def ds_send_to_lexonomy(dsid):
    """Push a dataset to Lexonomy for manual annotation.

    With ``?add_pages=1`` the currently annotated file is first pulled back
    from Lexonomy. Downstream ML/preview statuses are reset, stale Lexonomy
    entries deleted, and an asynchronous `make_lexonomy_request` task queued.
    """
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    user = User.query.filter_by(id=uid).first()
    db.session.close()
    dataset = Datasets.list_datasets(uid, dsid=dsid)

    additional_pages = flask.request.args.get('add_pages', default='0', type=str).lower() == '1'
    if additional_pages:
        # get file from lexonomy and save it
        get_lex_xml(uid, dsid)

    # Reset dataset status and delete old files @Lexonomy
    dataset.status['ml'] = None
    dataset.status['preview'] = None
    lexonomy_headers = {
        "Content-Type": 'application/json',
        "Authorization": app.config['LEXONOMY_AUTH_KEY']
    }
    for delete_url in (dataset.lexonomy_delete, dataset.lexonomy_ml_delete):
        if delete_url is not None:
            requests.post(delete_url, headers=lexonomy_headers)

    request_data = {
        'xml_file': '/api/lexonomy/{}/download/{}'.format(uid, dsid) + ('?add_pages=True' if additional_pages else ''),
        'email': user.email,
        'filename': dataset.name + ' - annotate',
        'type': 'edit',
        'url': app.config['URL'],
        'ske_user': True if user.sketch_engine_uid is not None else False,
        'return_to': ""  # remove if no longer required
    }
    print_log(app.name, 'Starting asynchronous request to Lexonomy {}'.format(dataset))
    make_lexonomy_request.apply_async(args=[dsid, request_data], countdown=0)

    # Update dataset status
    dataset.status['annotate'] = 'Starting'
    Datasets.dataset_status(dsid, set=True, status=dataset.status)
    return flask.make_response(
        {
            'message': 'OK',
            'dsid': dsid,
            'status': dataset.status['annotate'],
            'test_request': request_data
        }, 200)
def ds_machine_learning(dsid):
    """Legacy combined ML endpoint driven by boolean query flags.

    Flags (each ``?flag=True``): ``xml_format`` downloads the TEI-converted
    ML output; ``get_file`` pulls the annotated file from Lexonomy;
    ``run_ml`` queues the pdf2lex ML pipeline; ``send_file`` sends the ML
    output to Lexonomy as a preview.
    """
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    xml_format = flask.request.args.get('xml_format', default=None, type=str) == 'True'
    get_file = flask.request.args.get('get_file', default=None, type=str) == 'True'
    run_ml = flask.request.args.get('run_ml', default=None, type=str) == 'True'
    send_file = flask.request.args.get('send_file', default=None, type=str) == 'True'

    # TODO: Save paths to DB
    dataset = Datasets.list_datasets(uid, dsid=dsid)
    xml_lex = dataset.xml_lex
    xml_raw = dataset.xml_file_path
    print('xml_lex:', xml_lex, 'xml_raw:', xml_raw)
    # FIX: identity comparison with None (was `xml_lex == None`).
    if xml_lex is None:
        xml_ml_out = None
    else:
        xml_ml_out = xml_lex[:-4] + "-ML_OUT.xml"
        Datasets.dataset_add_ml_paths(dsid, xml_lex=dataset.xml_lex,
                                      xml_ml_out=xml_ml_out)

    # Check if all params are None
    # NOTE(review): the flags above are booleans (result of `== 'True'`), so
    # they are never None and this branch is dead — confirm before removing.
    if xml_format is None and get_file is None and run_ml is None and send_file is None:
        raise InvalidUsage("Invalid API call. No params.", status_code=422, enum="GET_ERROR")
    # Check if to many params
    elif xml_format and (get_file or run_ml or send_file):
        raise InvalidUsage("Invalid API call. Can't work on file and send it.",
                           status_code=422, enum="GET_ERROR")

    dataset.ml_task_id = Datasets.dataset_ml_task_id(dsid)
    status = dataset.status
    # Check if dataset has ml_task, then send status
    if dataset.ml_task_id:
        return flask.make_response(
            {
                "message": "File is still processing.",
                "dsid": dsid,
                "Status": status
            }, 200)

    # Check if user wants file and then return it
    # NOTE(review): `status` here is the whole status dict, so
    # `status not in [list of strings]` is always True; the intent was
    # presumably `status['ml'] not in [...]` — confirm before changing, as
    # callers may rely on the current always-download behavior.
    if xml_format and status not in [
            'Starting_ML', 'ML_Format', 'ML_Annotated', 'Lex2ML_Error',
            'ML_Error', 'ML2Lex_Error'
    ]:
        # TODO: get the latest annotated version from Lexonomy
        Datasets.update_dataset_status(dsid, 'Preparing_download')
        tmp_file = xml_ml_out.split(".xml")[0] + "_TEI.xml"
        character_map = Datasets.dataset_character_map(dsid)
        prepare_TEI_download(dsid, xml_ml_out, tmp_file, character_map)
        #tokenized2TEI(dsid, xml_ml_out, tmp_file, character_map)

        @after_this_request
        def after(response):
            response.headers['x-suggested-filename'] = filename
            response.headers.add('Access-Control-Expose-Headers', '*')
            Datasets.update_dataset_status(dsid, 'Lex_Format')
            os.remove(tmp_file)
            return response

        filename = dataset.name.split('.')[0] + '-transformed.xml'
        return flask.send_file(tmp_file, attachment_filename=filename,
                               as_attachment=True)
    elif xml_format:
        raise InvalidUsage("File is not ready. Try running ML again",
                           status_code=202, enum="STATUS_ERROR")

    # Run ML scripts
    if get_file:
        # Get file from Lexonomy
        status = "Lexonomy_Annotated"
        get_lex_xml(uid, dsid)
        Datasets.update_dataset_status(dsid, status)
    elif run_ml:
        status = "Starting_ML"
        Datasets.update_dataset_status(dsid, status)
        task = run_pdf2lex_ml_scripts.apply_async(
            args=[uid, dsid, xml_raw, xml_lex, xml_ml_out], countdown=0)
        Datasets.dataset_ml_task_id(dsid, set=True, task_id=task.id)
    elif send_file:
        # Send file to Lexonomy
        # stauts = "ML_Annotated_@Lexonomy"
        ds_sendML_to_lexonomy(uid, dsid)

    return flask.make_response(
        {
            "message": "OK",
            "dsid": dsid,
            "Status": status
        }, 200)
def ml_status(dsid):
    """Report the full status dict for a dataset."""
    uid = verify_user(flask.request.headers.get('Authorization'))
    dataset = Datasets.list_datasets(uid, dsid=dsid)
    payload = {'dsid': dsid, 'status': dataset.status}
    return flask.make_response(payload, 200)