Exemple #1
0
def repair_status():
    """
    Repair the status JSON of every dataset (one-off maintenance endpoint).

    Status should be json:
        {'annotate': [None, 'Starting', 'Processing', 'Lexonomy_Error', 'Ready'],
         'ml': [None, 'Starting_ML', 'Lex2ML_Error', 'ML_Format', 'ML_Error', 'ML_Annotated', 'ML2Lex_Error', 'Lex_Format'],
         'preview': [None, 'Starting', 'Processing', 'Lexonomy_Error', 'Ready'],
         'download': [None, 'Preparing_download', 'Ready']}

    delete method after, leave status description

    :return: flask response ``{'msg': 'ok'}`` with HTTP 200
    """
    # Dataset ids are probed blindly over a fixed range; ids that don't
    # exist (or fail for any other reason) are simply skipped.
    for dsid in range(0, 1000):
        try:
            dataset = Datasets.list_datasets(None, dsid=dsid)
            # Each sub-status is inferred from which Lexonomy access links
            # the dataset row already has.
            status = {
                'preview': None if dataset.lexonomy_ml_access is None else 'Ready',
                'ml': None if dataset.lexonomy_ml_access is None else 'Lex_Format',
                'annotate': None if dataset.lexonomy_access is None else 'Ready',
                'download': None
            }
            Datasets.dataset_status(dsid, set=True, status=status)
        except Exception:
            # Was a bare `except:`, which also swallows SystemExit and
            # KeyboardInterrupt; Exception keeps the skip-on-error intent.
            continue
    return flask.make_response({'msg': 'ok'}, 200)
Exemple #2
0
def lexonomy_download(uid, dsid):
    """Serve a dataset XML file to Lexonomy (ML preview or annotation pages)."""
    # Only Lexonomy itself, holding the shared secret, may call this endpoint.
    if flask.request.headers.get('Authorization') != app.config['LEXONOMY_AUTH_KEY']:
        raise InvalidUsage("Shared secret is not valid!", status_code=401, enum='UNAUTHORIZED')

    ml = flask.request.args.get('ml', default="False", type=str) == "True"
    additional_pages = flask.request.args.get('add_pages', default="False", type=str) == "True"
    dataset = Datasets.list_datasets(uid, dsid=dsid)

    # Mark the stage that this download belongs to as being processed.
    stage = 'preview' if ml else 'annotate'
    dataset.status[stage] = 'Processing'
    Datasets.dataset_status(dsid, set=True, status=dataset.status)

    tmp_path = dataset.xml_file_path.split(".xml")[0] + "-tmp.xml"

    @after_this_request
    def remove_file(response):
        # The temp file only needs to live for the duration of this response.
        os.remove(tmp_path)
        return response

    if ml:
        # ML output, truncated to the first 100 pages for preview.
        split_preview(dataset.xml_ml_out, tmp_path, 100)
        download_name = dataset.xml_ml_out.split('/')[-1]
    elif additional_pages:
        # The next 20 pages beyond what was already annotated.
        additional_n_pages(dataset.xml_file_path, dataset.xml_lex, tmp_path, 20)
        download_name = dataset.xml_file_path.split('/')[-1]
    else:
        # The first 20 pages for initial annotation.
        first_n_pages(dataset.xml_file_path, tmp_path, 20)
        download_name = dataset.xml_file_path.split('/')[-1]
    return flask.send_file(tmp_path, attachment_filename=download_name, as_attachment=True)
Exemple #3
0
def ml_download(dsid):
    """
    Download the TEI-transformed ML output of a dataset.

    Three-phase endpoint driven by status['download']:
    'Preparing_download' -> report progress; 'Ready' -> send the file and
    reset the status; otherwise kick off the async TEI preparation task.

    :raises InvalidUsage: 409 when there is no ML output yet, or the ML run
        has not finished.
    """
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    dataset = Datasets.list_datasets(uid, dsid=dsid)

    # TODO: This checks can be replaced: if preview exists (is Ready), then get it from Lexonomy and download it
    # TODO: otherwise notify user to send ml output to preview
    # Check if ml output is ready for download.
    # Fixed: was `dataset.xml_ml_out is ''` — an identity comparison against a
    # string literal, which is not a reliable equality test. The falsy check
    # covers both None and ''.
    if not dataset.xml_ml_out:
        raise InvalidUsage('No file for download. Try running ML first.',
                           status_code=409,
                           enum='STATUS_ERROR')
    elif dataset.status['ml'] in [
            None, 'Starting_ML', 'Lex2ML_Error', 'ML_Format', 'ML_Error',
            'ML_Annotated', 'ML2Lex_Error'
    ]:
        raise InvalidUsage(
            'File is not ready for download. Wait for ML to finish first.',
            status_code=409,
            enum='STATUS_ERROR')

    # The TEI transformation task writes to this path.
    tmp_file = dataset.xml_ml_out.split(".xml")[0] + "_TEI.xml"

    # Stop if already preparing download.
    if dataset.status['download'] == 'Preparing_download':
        return flask.make_response(
            {
                'msg': 'Dataset is preparing for download',
                'status': dataset.status
            }, 200)
    # If download is ready, return the file.
    elif dataset.status['download'] == 'Ready':
        dataset.status['download'] = None
        Datasets.dataset_status(dsid, set=True, status=dataset.status)

        filename = dataset.name.split('.')[0] + '-transformed.xml'

        @after_this_request
        def after(response):
            # Expose the suggested filename to the browser and clean up the
            # temporary TEI file once the response has been sent.
            response.headers['x-suggested-filename'] = filename
            response.headers.add('Access-Control-Expose-Headers', '*')
            os.remove(tmp_file)
            return response

        return flask.send_file(tmp_file,
                               attachment_filename=filename,
                               as_attachment=True,
                               conditional=True)

    # Prepare download asynchronously.
    dataset.status['download'] = 'Preparing_download'
    Datasets.dataset_status(dsid, set=True, status=dataset.status)
    character_map = Datasets.dataset_character_map(dsid)
    prepare_TEI_download.apply_async(
        args=[dsid, dataset.xml_ml_out, tmp_file, character_map])
    return flask.make_response(
        {
            'msg': 'Dataset is preparing for download',
            'status': dataset.status['download']
        }, 200)
Exemple #4
0
def ml_run(dsid):
    """
    Dataset should be annotated at Lexonomy so we can download it and start ML process.
    ML statuses: Starting_ML -> ML_Format -> ML_Annotated -> Lex_Format
    Error statuses: Lex2ML_Error, ML_Error, ML2Lex_Error

    Raises InvalidUsage (409) when the dataset is not annotated yet or an
    ML run is already in progress. Returns a flask response with the new
    'ml' status on success.
    """
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    # get annotations first, so we get lex_xml path in db
    dataset = Datasets.list_datasets(uid, dsid=dsid)
    if dataset.status['annotate'] != 'Ready':
        raise InvalidUsage('File is not annotated at Lexonomy.',
                           status_code=409,
                           enum='STATUS_ERROR')
    get_lex_xml(uid, dsid)
    # Re-read the row: get_lex_xml updates it (presumably the xml_lex path
    # used below — confirm against get_lex_xml).
    dataset = Datasets.list_datasets(uid, dsid=dsid)

    # deleting preview
    # NOTE(review): the cleared 'preview' value is only persisted by the
    # dataset_status call further down; if the already-running check below
    # raises first, this reset is lost — confirm that is intended.
    dataset.status['preview'] = None
    Datasets.dataset_add_ml_lexonomy_access(dsid)
    if dataset.lexonomy_ml_delete is not None:
        # Remove the stale ML preview dictionary from Lexonomy.
        requests.post(dataset.lexonomy_ml_delete,
                      headers={
                          "Content-Type": 'application/json',
                          "Authorization": app.config['LEXONOMY_AUTH_KEY']
                      })

    if dataset.status['ml'] in ['Starting_ML', 'ML_Format', 'ML_Annotated']:
        raise InvalidUsage('ML is already running.',
                           status_code=409,
                           enum='STATUS_ERROR')
    print_log(app.name, '{} Starting ML'.format(dataset))
    dataset.status['ml'] = 'Starting_ML'
    Datasets.dataset_status(dsid, set=True, status=dataset.status)
    # Get files ready
    xml_raw = dataset.xml_file_path
    xml_ml_out = dataset.xml_lex[:-4] + '-ML_OUT.xml'
    Datasets.dataset_add_ml_paths(dsid,
                                  xml_lex=dataset.xml_lex,
                                  xml_ml_out=xml_ml_out)
    # Run ml asynchronously and remember the Celery task id so it can be
    # tracked/cancelled later.
    task = run_pdf2lex_ml_scripts.apply_async(
        args=[uid, dsid, xml_raw, dataset.xml_lex, xml_ml_out], countdown=0)
    Datasets.dataset_ml_task_id(dsid, set=True, task_id=task.id)
    return flask.make_response(
        {
            'message': 'ok',
            'dsid': dsid,
            'status': dataset.status['ml']
        }, 200)
Exemple #5
0
def make_lexonomy_request(dsid, request_data, ml=False):
    """
    Create a new Lexonomy dictionary for a dataset and store its access links.

    Sent as an async task; updates the dataset's 'preview' (ml=True) or
    'annotate' (ml=False) status to 'Ready' or 'Lexonomy_Error' in the db.

    :param dsid: dataset id
    :param request_data: JSON payload for Lexonomy's /elexifier/new endpoint
    :param ml: True when the request is for an ML preview dictionary
    """
    # Send request async and save links to db
    response = requests.post('https://{}/elexifier/new'.format(app.config['LEXONOMY_URL']),
                             headers={"Content-Type": 'application/json', "Authorization": app.config['LEXONOMY_AUTH_KEY']},
                             data=json.dumps(request_data))
    status = Datasets.dataset_status(dsid)
    resp_js = json.loads(response.text)

    # ML requests drive the 'preview' status, plain ones 'annotate'.
    status_key = 'preview' if ml else 'annotate'
    add_access = (Datasets.dataset_add_ml_lexonomy_access if ml
                  else Datasets.dataset_add_lexonomy_access)

    # .get() instead of indexing: a successful response may not carry an
    # 'error' key at all, and a KeyError here would kill the task before
    # any status was written back.
    if resp_js.get('error') == 'email not found':
        status[status_key] = 'Lexonomy_Error'
        Datasets.dataset_status(dsid, set=True, status=status)
        return
    try:
        add_access(dsid, resp_js['access_link'], resp_js['edit_link'],
                   resp_js['delete_link'], resp_js['status_link'])
        status[status_key] = 'Ready'
    except Exception:
        # Missing link keys mean Lexonomy failed to create the dictionary.
        # (Was a bare `except:`.)
        status[status_key] = 'Lexonomy_Error'
    Datasets.dataset_status(dsid, set=True, status=status)
    return
Exemple #6
0
def ds_sendML_to_lexonomy(uid, dsid):
    """Push a dataset's ML output to Lexonomy as a read-only preview."""
    user = User.query.filter_by(id=uid).first()
    dataset = Datasets.list_datasets(uid, dsid=dsid)

    # Drop any stale preview dictionary at Lexonomy before creating a new one.
    if dataset.lexonomy_ml_delete is not None:
        requests.post(
            dataset.lexonomy_ml_delete,
            headers={
                "Content-Type": 'application/json',
                "Authorization": app.config['LEXONOMY_AUTH_KEY']
            })

    request_data = {
        'xml_file': '/api/lexonomy/' + str(uid) + '/download/' + str(dsid) + "?ml=True",
        'email': user.email,
        'filename': dataset.name + ' - preview',
        'type': 'preview',
        'url': app.config['URL'],
        'return_to': "",  # remove if no longer required
        'ske_user': user.sketch_engine_uid is not None,  # ske user
    }

    print('Starting asynchronous request to Lexonomy')
    make_lexonomy_request.apply_async(args=[dsid, request_data],
                                      kwargs={"ml": True},
                                      countdown=0)

    # Update dataset status
    status = Datasets.dataset_status(dsid)
    status['preview'] = 'Starting'
    Datasets.dataset_status(dsid, set=True, status=status)

    msg = 'OK'
    return flask.make_response(
        {
            'message': msg,
            'dsid': dsid,
            'status': status['preview'],
            'test_request': request_data
        }, 200)
Exemple #7
0
def run_pdf2lex_ml_scripts(uid, dsid, xml_raw, xml_lex, xml_out):
    """
    Celery task: run the three pdf2lex ML pipeline steps for a dataset.

    Steps (and the 'ml' status written after each one):
    1. xml2json  -> 'ML_Format'   (error: 'Lex2ML_Error')
    2. train_ML  -> 'ML_Annotated' (error: 'ML_Error')
    3. json2xml  -> 'Lex_Format'  (error: 'ML2Lex_Error')

    On any failure the error status is written, the task id is cleared,
    intermediate files are removed and a traceback is logged to ErrorLog.
    """
    # Create (touch) intermediate and output files up front.
    temp_fname = xml_raw.split('.xml')[0]
    json_ml_in = temp_fname + '-ML-IN.json'
    json_ml_out = temp_fname + '-ML-OUT.json'
    open(json_ml_in, 'a').close()
    open(json_ml_out, 'a').close()
    open(xml_out, 'a').close()

    def clean_files():
        # Intermediate JSON files are no longer needed once the task ends.
        os.remove(json_ml_in)
        os.remove(json_ml_out)

    status = Datasets.dataset_status(dsid)

    def fail(step_label, error_status):
        # Shared failure path: record status, clear task id, clean up, log.
        status['ml'] = error_status
        Datasets.dataset_status(dsid, set=True, status=status)
        Datasets.dataset_ml_task_id(dsid, set=True, task_id="")
        print_log('celery', 'Dictionary: {} @{} [ERROR]'.format(dsid, step_label))
        clean_files()
        ErrorLog.add_error_log(db,
                               dsid,
                               tag='ml_error',
                               message=traceback.format_exc())

    print_log('celery', 'Dictionary: {} @xml2json_ML'.format(dsid))  # step 1
    try:
        xml2json(xml_raw, xml_lex, json_ml_in)
        status['ml'] = 'ML_Format'
        Datasets.dataset_status(dsid, set=True, status=status)
    except Exception:
        fail('xml2json_ML', 'Lex2ML_Error')
        return

    print_log('celery', 'Dictionary: {} @train_ML'.format(dsid))  # step 2
    try:
        _, report = train_ML(json_ml_in, json_ml_out, '')
        ErrorLog.add_error_log(db, dsid, tag='ml_finished', message=report)
        status['ml'] = 'ML_Annotated'
        Datasets.dataset_status(dsid, set=True, status=status)
    except Exception:
        fail('train_ML', 'ML_Error')
        return

    print_log('celery', 'Dictionary: {} @json2xml_ML'.format(dsid))  # step 3
    try:
        json2xml(json_ml_out, xml_raw, xml_out)
        status['ml'] = 'Lex_Format'
        Datasets.dataset_status(dsid, set=True, status=status)
    except Exception:
        fail('json2xml_ML', 'ML2Lex_Error')
        return

    # Success: clear the task id and remove intermediates.
    Datasets.dataset_ml_task_id(dsid, set=True, task_id="")
    clean_files()
    return
Exemple #8
0
def ds_send_to_lexonomy(dsid):
    """Send a dataset to Lexonomy for annotation (initial or additional pages)."""
    token = flask.request.headers.get('Authorization')
    uid = verify_user(token)
    user = User.query.filter_by(id=uid).first()
    db.session.close()
    dataset = Datasets.list_datasets(uid, dsid=dsid)
    additional_pages = flask.request.args.get('add_pages',
                                              default='0',
                                              type=str).lower() == '1'

    if additional_pages:
        # Pull the currently annotated file from Lexonomy and save it first.
        get_lex_xml(uid, dsid)

    # Reset dataset status and delete old files @Lexonomy.
    dataset.status['ml'] = None
    dataset.status['preview'] = None
    auth_headers = {
        "Content-Type": 'application/json',
        "Authorization": app.config['LEXONOMY_AUTH_KEY']
    }
    for delete_url in (dataset.lexonomy_delete, dataset.lexonomy_ml_delete):
        if delete_url is not None:
            requests.post(delete_url, headers=auth_headers)

    download_url = '/api/lexonomy/{}/download/{}'.format(uid, dsid)
    if additional_pages:
        download_url += '?add_pages=True'
    request_data = {
        'xml_file': download_url,
        'email': user.email,
        'filename': dataset.name + ' - annotate',
        'type': 'edit',
        'url': app.config['URL'],
        'ske_user': user.sketch_engine_uid is not None,
        'return_to': "",  # remove if no longer required
    }

    print_log(app.name,
              'Starting asynchronous request to Lexonomy {}'.format(dataset))
    make_lexonomy_request.apply_async(args=[dsid, request_data], countdown=0)

    # Update dataset status.
    dataset.status['annotate'] = 'Starting'
    Datasets.dataset_status(dsid, set=True, status=dataset.status)

    return flask.make_response(
        {
            'message': 'OK',
            'dsid': dsid,
            'status': dataset.status['annotate'],
            'test_request': request_data
        }, 200)
Exemple #9
0
def prepare_TEI_download(dsid, input_file, output_file, character_map):
    """
    Celery task: transform a Lexonomy-format XML into TEI and write it to file.

    On completion sets the dataset's 'download' status to 'Ready' so the
    download endpoint can serve the file.

    :param dsid: dataset id (used for metadata lookup and status update)
    :param input_file: path to the ML-output XML in Lexonomy format
    :param output_file: path the transformed TEI XML is written to
    :param character_map: token cleanup map, or None for no mapping
    """
    # Load json for transformation. (json.load replaces the old
    # read-then-loads plus a redundant close() inside the `with` block.)
    json_file = os.path.join(app.config['APP_DIR'],
                             'modules/pdf2lex_ml/lexonomy_to_tei.json')
    with open(json_file, 'r') as file:
        transformation_json = json.load(file)

    # clean tokens
    lexonomy_xml = lxml.etree.parse(input_file)
    if character_map is None:
        character_map = dict()
    clean_tokens(lexonomy_xml.getroot(), character_map)
    orig_xml = lxml.etree.tostring(lexonomy_xml)

    # Re-parse with the transformator's element class so the mapper gets
    # its own element type.
    parserLookup = lxml.etree.ElementDefaultClassLookup(
        element=transformator.TMyElement)
    myParser = lxml.etree.XMLParser()
    myParser.set_element_class_lookup(parserLookup)
    lexonomy_xml = lxml.etree.fromstring(orig_xml, parser=myParser)

    # init transformator
    mapping = transformator.TMapping(transformation_json)
    mapper = transformator.TMapper()

    # transform lexonomy format to tei format
    metadata = Datasets.dataset_metadata(dsid)
    out_TEI, out_aug = mapper.Transform(
        mapping,
        [],
        [lxml.etree.ElementTree(lexonomy_xml)],
        makeAugmentedInputTrees=True,
        stripForValidation=False,
        stripHeader=False,
        #stripDictScrap=True, # TODO: change when fixed
        stripDictScrap=False,
        headerTitle=False,
        headerPublisher=False,
        headerBibl=False,
        promoteNestedEntries=True,
        metadata=metadata)
    print_log('DEBUG', 'transformed')
    target_xml = '\n' + lxml.etree.tostring(
        out_TEI, pretty_print=True, encoding='unicode')
    print_log('DEBUG', 'in string')
    # Strip the namespace-qualified entry opening tag down to a plain one.
    target_xml = target_xml.replace(
        '<entry xmlns:m="http://elex.is/wp1/teiLex0Mapper/meta" xmlns:a="http://elex.is/wp1/teiLex0Mapper/legacyAttributes" xmlns="http://www.tei-c.org/ns/1.0">',
        '<entry>')
    print_log('DEBUG', 'entry replaced')

    # Write the transformed xml to file. Mode 'w' creates/truncates the file
    # itself, so the old `open(output_file, 'a').close()` touch and the
    # redundant close() inside `with` were dropped.
    print_log('DEBUG', 'writing to file')
    with open(output_file, 'w') as out:
        out.write(target_xml)
    print_log('DEBUG', 'writing finished')
    status = Datasets.dataset_status(dsid)
    status['download'] = 'Ready'
    Datasets.dataset_status(dsid, set=True, status=status)
    return