def repair_status():
    """
    Repair the statuses of all datasets. A status is a JSON object with one value per stage:
    {'annotate': [None, 'Starting', 'Processing', 'Lexonomy_Error', 'Ready'],
     'ml': [None, 'Starting_ML', 'Lex2ML_Error', 'ML_Format', 'ML_Error',
            'ML_Annotated', 'ML2Lex_Error', 'Lex_Format'],
     'preview': [None, 'Starting', 'Processing', 'Lexonomy_Error', 'Ready'],
     'download': [None, 'Preparing_download', 'Ready']}
    Delete this method once the repair has run; keep the status description.
    """
    for dsid in range(0, 1000):
        try:
            dataset = Datasets.list_datasets(None, dsid=dsid)
            status = {
                'preview': None if dataset.lexonomy_ml_access is None else 'Ready',
                'ml': None if dataset.lexonomy_ml_access is None else 'Lex_Format',
                'annotate': None if dataset.lexonomy_access is None else 'Ready',
                'download': None
            }
            Datasets.dataset_status(dsid, set=True, status=status)
        except Exception:
            # skip ids without a dataset instead of aborting the whole repair
            continue
    return flask.make_response({'msg': 'ok'}, 200)
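# Datasets.dataset_status is used throughout this module as both getter and
# setter. A minimal sketch of the assumed contract (the ORM model and column
# names here are hypothetical; the real implementation lives in Datasets):
def _dataset_status_sketch(dsid, set=False, status=None):
    row = DatasetRow.query.filter_by(id=dsid).first()  # hypothetical ORM model
    if set:
        row.status = json.dumps(status)  # persist the whole status object
        db.session.commit()
        return status
    return json.loads(row.status)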
def lexonomy_download(uid, dsid):
    if flask.request.headers.get('Authorization') != app.config['LEXONOMY_AUTH_KEY']:
        raise InvalidUsage("Shared secret is not valid!", status_code=401, enum='UNAUTHORIZED')

    ml = flask.request.args.get('ml', default="False", type=str) == "True"
    additional_pages = flask.request.args.get('add_pages', default="False", type=str) == "True"
    dataset = Datasets.list_datasets(uid, dsid=dsid)

    # Set dataset status
    if ml:
        dataset.status['preview'] = 'Processing'
    else:
        dataset.status['annotate'] = 'Processing'
    Datasets.dataset_status(dsid, set=True, status=dataset.status)

    temp_fname = dataset.xml_file_path.split(".xml")[0] + "-tmp.xml"

    @after_this_request
    def remove_file(response):
        os.remove(temp_fname)
        return response

    if ml:
        # Send the ML output file
        split_preview(dataset.xml_ml_out, temp_fname, 100)
        return flask.send_file(temp_fname,
                               attachment_filename=dataset.xml_ml_out.split('/')[-1],
                               as_attachment=True)
    elif not additional_pages:
        # Send a file with the first 20 pages
        first_n_pages(dataset.xml_file_path, temp_fname, 20)
        return flask.send_file(temp_fname,
                               attachment_filename=dataset.xml_file_path.split('/')[-1],
                               as_attachment=True)
    else:
        # Send a file with 20 additional pages
        additional_n_pages(dataset.xml_file_path, dataset.xml_lex, temp_fname, 20)
        return flask.send_file(temp_fname,
                               attachment_filename=dataset.xml_file_path.split('/')[-1],
                               as_attachment=True)
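# For reference, Lexonomy fetches files through this endpoint with the shared
# secret in the Authorization header. An illustrative call (host and ids are
# placeholders; the path mirrors the xml_file value built in
# ds_send_to_lexonomy / ds_sendML_to_lexonomy below):
#
#   curl -H "Authorization: <LEXONOMY_AUTH_KEY>" \
#        "https://<APP_URL>/api/lexonomy/42/download/7?ml=True" -o preview.xml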
def ml_download(dsid):
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    dataset = Datasets.list_datasets(uid, dsid=dsid)

    # TODO: These checks can be replaced: if preview exists (is Ready), get it from Lexonomy and download it;
    # TODO: otherwise notify the user to send the ML output to preview.

    # check if the ML output is ready for download
    if dataset.xml_ml_out is None or dataset.xml_ml_out == '':
        raise InvalidUsage('No file for download. Try running ML first.',
                           status_code=409, enum='STATUS_ERROR')
    elif dataset.status['ml'] in [None, 'Starting_ML', 'Lex2ML_Error', 'ML_Format',
                                  'ML_Error', 'ML_Annotated', 'ML2Lex_Error']:
        raise InvalidUsage('File is not ready for download. Wait for ML to finish first.',
                           status_code=409, enum='STATUS_ERROR')

    tmp_file = dataset.xml_ml_out.split(".xml")[0] + "_TEI.xml"

    # stop if the download is already being prepared
    if dataset.status['download'] == 'Preparing_download':
        return flask.make_response({'msg': 'Dataset is preparing for download',
                                    'status': dataset.status}, 200)

    # if the download is ready, return the file
    elif dataset.status['download'] == 'Ready':
        dataset.status['download'] = None
        Datasets.dataset_status(dsid, set=True, status=dataset.status)

        filename = dataset.name.split('.')[0] + '-transformed.xml'

        @after_this_request
        def after(response):
            response.headers['x-suggested-filename'] = filename
            response.headers.add('Access-Control-Expose-Headers', '*')
            os.remove(tmp_file)
            return response

        return flask.send_file(tmp_file, attachment_filename=filename,
                               as_attachment=True, conditional=True)

    # prepare the download
    dataset.status['download'] = 'Preparing_download'
    Datasets.dataset_status(dsid, set=True, status=dataset.status)
    character_map = Datasets.dataset_character_map(dsid)
    prepare_TEI_download.apply_async(args=[dsid, dataset.xml_ml_out, tmp_file, character_map])
    return flask.make_response({'msg': 'Dataset is preparing for download',
                                'status': dataset.status['download']}, 200)
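# Usage sketch: the endpoint above is two-phase, so a client polls until the
# JSON 'Preparing_download' response turns into the actual file. The route
# and host here are hypothetical placeholders:
def _poll_ml_download_example(base_url, token, dsid):
    import time
    import requests
    while True:
        r = requests.get('{}/api/dataset/{}/ml/download'.format(base_url, dsid),  # hypothetical route
                         headers={'Authorization': token})
        if r.headers.get('Content-Type', '').startswith('application/json'):
            time.sleep(5)  # still 'Preparing_download' (or an error payload)
            continue
        return r.content  # the transformed TEI file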
def ml_run(dsid):
    """
    The dataset should be annotated at Lexonomy, so we can download it and start the ML process.
    ML statuses: Starting_ML -> ML_Format -> ML_Annotated -> Lex_Format
    Error statuses: Lex2ML_Error, ML_Error, ML2Lex_Error
    """
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)

    # get the annotations first, so we get the lex_xml path into the db
    dataset = Datasets.list_datasets(uid, dsid=dsid)
    if dataset.status['annotate'] != 'Ready':
        raise InvalidUsage('File is not annotated at Lexonomy.',
                           status_code=409, enum='STATUS_ERROR')

    get_lex_xml(uid, dsid)
    dataset = Datasets.list_datasets(uid, dsid=dsid)

    # delete the preview
    dataset.status['preview'] = None
    Datasets.dataset_add_ml_lexonomy_access(dsid)
    if dataset.lexonomy_ml_delete is not None:
        requests.post(dataset.lexonomy_ml_delete,
                      headers={"Content-Type": 'application/json',
                               "Authorization": app.config['LEXONOMY_AUTH_KEY']})

    if dataset.status['ml'] in ['Starting_ML', 'ML_Format', 'ML_Annotated']:
        raise InvalidUsage('ML is already running.', status_code=409, enum='STATUS_ERROR')

    print_log(app.name, '{} Starting ML'.format(dataset))
    dataset.status['ml'] = 'Starting_ML'
    Datasets.dataset_status(dsid, set=True, status=dataset.status)

    # Get the files ready
    xml_raw = dataset.xml_file_path
    xml_ml_out = dataset.xml_lex[:-4] + '-ML_OUT.xml'
    Datasets.dataset_add_ml_paths(dsid, xml_lex=dataset.xml_lex, xml_ml_out=xml_ml_out)

    # Run ML
    task = run_pdf2lex_ml_scripts.apply_async(args=[uid, dsid, xml_raw, dataset.xml_lex, xml_ml_out],
                                              countdown=0)
    Datasets.dataset_ml_task_id(dsid, set=True, task_id=task.id)

    return flask.make_response({'message': 'ok',
                                'dsid': dsid,
                                'status': dataset.status['ml']}, 200)
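# The Celery task id stored via Datasets.dataset_ml_task_id can be used to
# inspect the background job later. A minimal sketch, assuming the Celery app
# instance is importable as `celery` (that name is an assumption):
def _ml_task_state_example(task_id):
    from celery.result import AsyncResult
    result = AsyncResult(task_id, app=celery)  # assumed Celery app instance
    return result.state  # e.g. 'PENDING', 'STARTED', 'SUCCESS', 'FAILURE'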
def make_lexonomy_request(dsid, request_data, ml=False):
    # Send the request asynchronously and save the returned links to the db
    response = requests.post('https://{}/elexifier/new'.format(app.config['LEXONOMY_URL']),
                             headers={"Content-Type": 'application/json',
                                      "Authorization": app.config['LEXONOMY_AUTH_KEY']},
                             data=json.dumps(request_data))
    status = Datasets.dataset_status(dsid)
    resp_js = json.loads(response.text)

    if ml:
        if resp_js.get('error') == 'email not found':
            status['preview'] = 'Lexonomy_Error'
            Datasets.dataset_status(dsid, set=True, status=status)
            return
        try:
            Datasets.dataset_add_ml_lexonomy_access(dsid, resp_js['access_link'], resp_js['edit_link'],
                                                    resp_js['delete_link'], resp_js['status_link'])
            status['preview'] = 'Ready'
        except Exception:
            status['preview'] = 'Lexonomy_Error'
    else:
        if resp_js.get('error') == 'email not found':
            status['annotate'] = 'Lexonomy_Error'
            Datasets.dataset_status(dsid, set=True, status=status)
            return
        try:
            Datasets.dataset_add_lexonomy_access(dsid, resp_js['access_link'], resp_js['edit_link'],
                                                 resp_js['delete_link'], resp_js['status_link'])
            status['annotate'] = 'Ready'
        except Exception:
            status['annotate'] = 'Lexonomy_Error'

    Datasets.dataset_status(dsid, set=True, status=status)
    return
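# The handler above expects a Lexonomy response of roughly this shape
# (inferred from the keys it accesses; values are illustrative):
#
#   {
#     "access_link": "https://<LEXONOMY_URL>/elexifier42",
#     "edit_link": "...",
#     "delete_link": "...",
#     "status_link": "..."
#   }
#
# or, on failure: {"error": "email not found"}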
def ds_sendML_to_lexonomy(uid, dsid):
    user = User.query.filter_by(id=uid).first()
    dataset = Datasets.list_datasets(uid, dsid=dsid)

    # delete the old preview @Lexonomy, if one exists
    if dataset.lexonomy_ml_delete is not None:
        requests.post(dataset.lexonomy_ml_delete,
                      headers={"Content-Type": 'application/json',
                               "Authorization": app.config['LEXONOMY_AUTH_KEY']})

    request_data = {
        'xml_file': '/api/lexonomy/' + str(uid) + '/download/' + str(dsid) + "?ml=True",
        'email': user.email,
        'filename': dataset.name + ' - preview',
        'type': 'preview',
        'url': app.config['URL'],
        'ske_user': user.sketch_engine_uid is not None,  # ske user
        'return_to': ""  # remove if no longer required
    }

    print_log(app.name, 'Starting asynchronous request to Lexonomy')
    make_lexonomy_request.apply_async(args=[dsid, request_data], kwargs={"ml": True}, countdown=0)

    # Update dataset status
    status = Datasets.dataset_status(dsid)
    status['preview'] = 'Starting'
    Datasets.dataset_status(dsid, set=True, status=status)

    return flask.make_response({'message': 'OK',
                                'dsid': dsid,
                                'status': status['preview'],
                                'test_request': request_data}, 200)
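# For reference, the JSON body that make_lexonomy_request POSTs to Lexonomy's
# /elexifier/new for a preview ends up looking like this (values illustrative):
#
#   {
#     "xml_file": "/api/lexonomy/42/download/7?ml=True",
#     "email": "user@example.org",
#     "filename": "my_dictionary.pdf - preview",
#     "type": "preview",
#     "url": "https://<APP_URL>",
#     "ske_user": false,
#     "return_to": ""
#   }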
def run_pdf2lex_ml_scripts(uid, dsid, xml_raw, xml_lex, xml_out):
    # Create the intermediate and output files
    temp_fname = xml_raw.split('.xml')[0]
    json_ml_in = temp_fname + '-ML-IN.json'
    json_ml_out = temp_fname + '-ML-OUT.json'
    open(json_ml_in, 'a').close()
    open(json_ml_out, 'a').close()
    open(xml_out, 'a').close()

    def clean_files():
        os.remove(json_ml_in)
        os.remove(json_ml_out)

    status = Datasets.dataset_status(dsid)

    # step 1
    print_log('celery', 'Dictionary: {} @xml2json_ML'.format(dsid))
    try:
        xml2json(xml_raw, xml_lex, json_ml_in)
        status['ml'] = 'ML_Format'
        Datasets.dataset_status(dsid, set=True, status=status)
    except Exception:
        status['ml'] = 'Lex2ML_Error'
        Datasets.dataset_status(dsid, set=True, status=status)
        Datasets.dataset_ml_task_id(dsid, set=True, task_id="")
        print_log('celery', 'Dictionary: {} @xml2json_ML [ERROR]'.format(dsid))
        clean_files()
        ErrorLog.add_error_log(db, dsid, tag='ml_error', message=traceback.format_exc())
        return

    # step 2
    print_log('celery', 'Dictionary: {} @train_ML'.format(dsid))
    try:
        _, report = train_ML(json_ml_in, json_ml_out, '')
        ErrorLog.add_error_log(db, dsid, tag='ml_finished', message=report)
        status['ml'] = 'ML_Annotated'
        Datasets.dataset_status(dsid, set=True, status=status)
    except Exception:
        status['ml'] = 'ML_Error'
        Datasets.dataset_status(dsid, set=True, status=status)
        Datasets.dataset_ml_task_id(dsid, set=True, task_id="")
        print_log('celery', 'Dictionary: {} @train_ML [ERROR]'.format(dsid))
        clean_files()
        ErrorLog.add_error_log(db, dsid, tag='ml_error', message=traceback.format_exc())
        return

    # step 3
    print_log('celery', 'Dictionary: {} @json2xml_ML'.format(dsid))
    try:
        json2xml(json_ml_out, xml_raw, xml_out)
        status['ml'] = 'Lex_Format'
        Datasets.dataset_status(dsid, set=True, status=status)
    except Exception:
        status['ml'] = 'ML2Lex_Error'
        Datasets.dataset_status(dsid, set=True, status=status)
        Datasets.dataset_ml_task_id(dsid, set=True, task_id="")
        print_log('celery', 'Dictionary: {} @json2xml_ML [ERROR]'.format(dsid))
        clean_files()
        ErrorLog.add_error_log(db, dsid, tag='ml_error', message=traceback.format_exc())
        return

    Datasets.dataset_ml_task_id(dsid, set=True, task_id="")
    clean_files()
    return
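# Data flow of the three steps above, with the status recorded after each:
#
#   xml_raw + xml_lex     --xml2json--> json_ml_in   (ML_Format)
#   json_ml_in            --train_ML--> json_ml_out  (ML_Annotated)
#   json_ml_out + xml_raw --json2xml--> xml_out      (Lex_Format)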
def ds_send_to_lexonomy(dsid):
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    user = User.query.filter_by(id=uid).first()
    db.session.close()
    dataset = Datasets.list_datasets(uid, dsid=dsid)

    additional_pages = flask.request.args.get('add_pages', default='0', type=str).lower() == '1'
    if additional_pages:
        # get the file from Lexonomy and save it
        get_lex_xml(uid, dsid)

    # Reset dataset status and delete old files @Lexonomy
    dataset.status['ml'] = None
    dataset.status['preview'] = None
    if dataset.lexonomy_delete is not None:
        requests.post(dataset.lexonomy_delete,
                      headers={"Content-Type": 'application/json',
                               "Authorization": app.config['LEXONOMY_AUTH_KEY']})
    if dataset.lexonomy_ml_delete is not None:
        requests.post(dataset.lexonomy_ml_delete,
                      headers={"Content-Type": 'application/json',
                               "Authorization": app.config['LEXONOMY_AUTH_KEY']})

    request_data = {
        'xml_file': '/api/lexonomy/{}/download/{}'.format(uid, dsid) + ('?add_pages=True' if additional_pages else ''),
        'email': user.email,
        'filename': dataset.name + ' - annotate',
        'type': 'edit',
        'url': app.config['URL'],
        'ske_user': user.sketch_engine_uid is not None,
        'return_to': ""  # remove if no longer required
    }

    print_log(app.name, 'Starting asynchronous request to Lexonomy {}'.format(dataset))
    make_lexonomy_request.apply_async(args=[dsid, request_data], countdown=0)

    # Update dataset status
    dataset.status['annotate'] = 'Starting'
    Datasets.dataset_status(dsid, set=True, status=dataset.status)

    return flask.make_response({'message': 'OK',
                                'dsid': dsid,
                                'status': dataset.status['annotate'],
                                'test_request': request_data}, 200)
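# Note on the add_pages handshake: a client calls this endpoint with
# ?add_pages=1, which first pulls the current annotations back from Lexonomy
# (get_lex_xml) and then hands Lexonomy a download URL carrying
# ?add_pages=True, so lexonomy_download serves 20 pages that are not yet in
# the annotated file (additional_n_pages) instead of the first 20.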
def prepare_TEI_download(dsid, input_file, output_file, character_map):
    # Load the json mapping for the transformation
    json_file = os.path.join(app.config['APP_DIR'], 'modules/pdf2lex_ml/lexonomy_to_tei.json')
    with open(json_file, 'r') as file:
        json_data = file.read()
    transformation_json = json.loads(json_data)

    # clean tokens
    lexonomy_xml = lxml.etree.parse(input_file)
    if character_map is None:
        character_map = dict()
    clean_tokens(lexonomy_xml.getroot(), character_map)
    orig_xml = lxml.etree.tostring(lexonomy_xml)

    # re-parse with the transformator's element class
    parserLookup = lxml.etree.ElementDefaultClassLookup(element=transformator.TMyElement)
    myParser = lxml.etree.XMLParser()
    myParser.set_element_class_lookup(parserLookup)
    lexonomy_xml = lxml.etree.fromstring(orig_xml, parser=myParser)

    # init the transformator
    mapping = transformator.TMapping(transformation_json)
    mapper = transformator.TMapper()

    # transform the Lexonomy format into the TEI format
    metadata = Datasets.dataset_metadata(dsid)
    out_TEI, out_aug = mapper.Transform(mapping, [],
                                        [lxml.etree.ElementTree(lexonomy_xml)],
                                        makeAugmentedInputTrees=True,
                                        stripForValidation=False,
                                        stripHeader=False,
                                        stripDictScrap=False,  # TODO: set back to True when fixed
                                        headerTitle=False,
                                        headerPublisher=False,
                                        headerBibl=False,
                                        promoteNestedEntries=True,
                                        metadata=metadata)
    print_log('DEBUG', 'transformed')
    target_xml = '\n' + lxml.etree.tostring(out_TEI, pretty_print=True, encoding='unicode')
    print_log('DEBUG', 'in string')

    # drop the mapper's namespace declarations from entry elements
    target_xml = target_xml.replace(
        '<entry xmlns:m="http://elex.is/wp1/teiLex0Mapper/meta" xmlns:a="http://elex.is/wp1/teiLex0Mapper/legacyAttributes" xmlns="http://www.tei-c.org/ns/1.0">',
        '<entry>')
    print_log('DEBUG', 'entry replaced')

    # write the transformed xml to file
    print_log('DEBUG', 'writing to file')
    with open(output_file, 'w') as out:
        out.write(target_xml)
    print_log('DEBUG', 'writing finished')

    status = Datasets.dataset_status(dsid)
    status['download'] = 'Ready'
    Datasets.dataset_status(dsid, set=True, status=status)
    return
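# The string replacement above relies on lxml serializing every entry's
# namespace declarations in exactly that attribute order. An illustrative
# before/after of a single entry element:
#
#   <entry xmlns:m="http://elex.is/wp1/teiLex0Mapper/meta"
#          xmlns:a="http://elex.is/wp1/teiLex0Mapper/legacyAttributes"
#          xmlns="http://www.tei-c.org/ns/1.0">   ->   <entry>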