Example #1
class AlveoSegmentationRoute(SegmenterWrapper):
    decorators = [
        auth_required,
        limiter.limit("15 per minute"),
        limiter.limit("150 per day")
    ]

    def _processor_get(self, user_id, remote_path):
        api_key = g.user.remote_api_key
        verify_access(remote_path, api_key)

        short_path = shorten_path(remote_path)

        result = get_binary_object(short_path)
        if result is None:
            result = segment_document(remote_path, api_key)
            if result is None:
                abort(400, 'Could not access requested document')
            else:
                create_binary_object(short_path, result)

        return result

    def _processor_post(self, user_id, audiofile):
        result = segment_audio_data(audiofile.read())
        if result is None:
            abort(400, "Uploaded file is not a valid .wav audio file.")

        return result
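
The GET handler above is a cache-aside lookup: return the cached segmentation if one exists, otherwise segment and store the result for next time. A minimal sketch of what the storage helpers might look like, assuming a simple key/value SQLAlchemy model (`BinaryObject` is hypothetical, not part of the example's codebase):

def get_binary_object(short_path):
    # Return the cached payload for this key, or None on a cache miss.
    record = BinaryObject.query.filter(BinaryObject.key == short_path).first()
    return None if record is None else record.value

def create_binary_object(short_path, result):
    # Persist the computed result so later requests hit the cache.
    db.session.add(BinaryObject(key=short_path, value=result))
    db.session.commit()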
Example #2
class AlveoASRRetrieveJobRoute(RetrieveJobWrapper):
    decorators = [
        auth_required,
        limiter.limit("25 per minute"),
        limiter.limit("1000 per day")
    ]

    def _processor_get(self, user_id, job_id):
        jobs = job_query(user_id=user_id, job_id=job_id)

        if len(jobs) < 1:
            abort(404, "You have no job matching that job_id")

        job = jobs[0]
        status = job.status
        ds_object = job.datastore
        data = {
            "job_id": job_id,
            "status": JobTypes(status).name,
            "description": job.description
        }

        if status == JobTypes.FINISHED.value:
            data["result"] = export_asrdata(ds_object)

        return data
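
`JobTypes(status).name` and the `.value` comparison imply that `status` is stored as the enum's underlying value. A plausible shape for `JobTypes`, inferred from the members these routes reference (the concrete values here are assumptions):

from enum import Enum

class JobTypes(Enum):
    QUEUED = 0      # job is waiting in the redis queue
    FINISHED = 1    # worker completed; result is in the datastore
    FAILED = 2      # job could not be found or errored
    CANCELLED = 3   # user cancelled before the worker picked it up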
Example #3
class AlveoASRCancelJobRoute(CancelJobWrapper):
    decorators = [
        auth_required,
        limiter.limit("50 per minute"),
        limiter.limit("1000 per hour"),
        limiter.limit("5000 per day")
    ]

    def _processor_get(self, user_id, job_id):
        jobs = job_query(user_id=user_id, job_id=job_id)

        if len(jobs) < 1:
            abort(404, "You have no job matching that job_id")

        job_model = jobs[0]
        status = job_model.status

        if status != JobTypes.QUEUED.value:
            abort(401, "Job ID `%s` is not queued" % job_id)

        job_object = redis_queue.fetch_job(job_model.external_id)

        if job_object is None:
            # Persist the failed status before aborting; abort() raises,
            # so anything after it would never run.
            job_model.status = JobTypes.FAILED.value
            db.session.commit()
            abort(
                401, "Job ID `%s` couldn't be found. Moving to 'failed' pool" %
                job_id)

        job_object.cancel()
        job_model.status = JobTypes.CANCELLED.value
        db.session.delete(job_model.datastore)
        job_model.datastore = None
        db.session.commit()

        return {"status": "cancelled", "job_id": job_id}
Example #4
class AlveoListByUserRoute(ListByUserWrapper):
    decorators = [
        auth_required,
        limiter.limit("75 per minute"),
        limiter.limit("1000 per hour"),
        limiter.limit("5000 per day")
    ]

    def _processor_get(self, user_id):
        return datastore_list(user_id=user_id)
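
The `decorators` class attribute comes from Flask's class-based views: everything in the list wraps the generated view function, so stacking `limiter.limit(...)` entries enforces all of the limits at once. A minimal registration sketch, assuming these wrappers ultimately extend `flask_restful.Resource` (the URL rule is illustrative, not the service's real routing):

from flask import Flask
from flask_restful import Api
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address

app = Flask(__name__)
api = Api(app)
limiter = Limiter(key_func=get_remote_address, app=app)

# Hypothetical URL rule; the real prefix is not shown in these examples.
api.add_resource(AlveoListByUserRoute, '/datastore/list')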
Example #5
class AlveoExportByUserRoute(ExportByUserWrapper):
    decorators = [
        auth_required,
        limiter.limit("10 per minute"),
        limiter.limit("40 per hour"),
        limiter.limit("200 per day")
    ]

    def _processor_get(self, user_id):
        return datastore_export(user_id=user_id)
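
`datastore_list` and `datastore_export` are not shown in these examples. Plausible sketches, assuming `Datastore` rows carry a key and a JSON-encoded byte value (as the store route below suggests):

import json

def datastore_list(user_id):
    # Enumerate the keys a user owns without decoding the payloads.
    records = Datastore.query.filter(Datastore.user_id == user_id).all()
    return {"keys": [record.key for record in records]}

def datastore_export(user_id):
    # Full dump: decode every stored value for this user.
    records = Datastore.query.filter(Datastore.user_id == user_id).all()
    return {record.key: json.loads(record.value.decode())
            for record in records}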
Example #6
class AlveoASRAddJobRoute(AddJobWrapper):
    decorators = [
        auth_required,
        limiter.limit("5 per minute"),
        limiter.limit("50 per day")
    ]

    def _processor_get(self, user_id, remote_path):
        api_key = g.user.remote_api_key
        verify_access(remote_path, api_key)

        short_path = shorten_path(remote_path)

        # Check cache first.
        #  Note/TODO: Does not check the job queue. Would that even be ideal?
        #   Another user could cancel their job. Jobs would need multiple authors.
        cached_asr = Datastore.query.filter(
            Datastore.key == '%s:%s:%s' % (DOMAIN, ENGINE, short_path)).filter(
                Datastore.user_id == g.user.id).first()
        if cached_asr is not None:
            if cached_asr.alias == "ready":
                return {
                    "status": "cached",
                    "result": export_asrdata(cached_asr)
                }
            else:
                return {
                    "status": "pending",
                    "job_id": cached_asr.alias.split(":")[1]
                }

        worker = redis_queue.enqueue(transcribe_document, remote_path, api_key)
        ds = Datastore(key="%s:%s:%s" % (DOMAIN, ENGINE, short_path),
                       value="{}",
                       storage_spec="asr-engine-gcloud/json/1.0",
                       user=g.user,
                       alias="init:%s" % worker.id)
        job = Job(external_id=worker.id,
                  user=g.user,
                  datastore=ds,
                  description="ASR via engine 'gcloud' for item: %s" %
                  short_path)
        db.session.add(ds)
        db.session.add(job)
        db.session.commit()

        return {"status": "queued", "job_id": job.id}

    def _processor_post(self, user_id, audiofile):
        result = transcribe(audiofile.read())
        if result is None:
            abort(400, "Uploaded file is not a valid .wav audio file.")

        return result
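
The add-job route leans on two conventions: the cache key `DOMAIN:ENGINE:short_path`, and the datastore alias as a tiny state machine (`init:<rq job id>` while queued, `ready` once complete). The worker-side completion step is not shown; a hypothetical sketch of how it would line up with the `cached`/`pending` branches above:

import json

def store_asr_result(short_path, result):
    # Hypothetical: run by the worker once transcribe_document finishes.
    # Flipping the alias to "ready" is what turns later add-job requests
    # into "status": "cached" responses.
    key = '%s:%s:%s' % (DOMAIN, ENGINE, short_path)
    ds = Datastore.query.filter(Datastore.key == key).first()
    ds.set_value(json.dumps(result))
    ds.alias = "ready"
    db.session.commit()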
Example #7
class AlveoSegmentationRoute(SegmenterWrapper):
    decorators = [
        auth_required,
        limiter.limit("15 per minute"),
        limiter.limit("150 per day")
    ]

    def _processor_get(self, user_id, remote_path):
        api_path = str(urlparse(remote_path).path)
        if '/' not in api_path or api_path == "/":
            abort(
                400,
                'Request did not include an Alveo document identifier to segment'
            )

        # We care more about the user itself than the user_id. Another option
        # would be to query the database for something that matches the key,
        # but that would be slower.
        api_key = g.user.remote_api_key

        alveo_metadata = get_module_metadata("alveo")
        api_url = alveo_metadata['api_url']
        client = pyalveo.Client(api_url=api_url,
                                api_key=api_key,
                                use_cache=False,
                                update_cache=False,
                                cache_dir=None)

        # Check if we can access the list first.
        # Would be good if we could just check Alveo permissions instead of retrieving the item directly.
        # https://github.com/Alveo/pyalveo/issues/11
        try:
            itemlist_path = remote_path.split('/document/')[0]
            itemlist = client.get_item(itemlist_path)
        except APIError as e:
            abort(400, "Response from remote host: \n" + str(e))

        result = get_cached_result(shorten_path(remote_path))
        if result is None:
            result = segment_document(remote_path, api_key)
            if result is None:
                abort(400, 'Could not access requested document')
            else:
                cache_result(shorten_path(remote_path), result)

        return result

    def _processor_post(self, user_id, audiofile):
        result = segment_audio_data(audiofile.read())
        if result is None:
            abort(400, "Uploaded file is not a valid .wav audio file.")

        return result
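
An illustrative request against this route, assuming it is registered at `/segment`, that the document identifier arrives as a `remote_path` query parameter, and that `auth_required` reads an API-key header (all three are assumptions, and the URL is made up):

with app.test_client() as client:
    response = client.get(
        '/segment',
        query_string={
            'remote_path':
            'https://example.alveo.host/catalog/demo/item1/document/sample.wav'
        },
        headers={'X-Api-Key': 'my-alveo-key'})
    print(response.status_code, response.get_json())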
Example #8
class AlveoStoreRoute(StoreWrapper):
    decorators = [
        auth_required,
        limiter.limit("1000 per minute"),
        limiter.limit("20000 per hour"),
        limiter.limit("100000 per day")
    ]

    def _processor_get(self, object_id, user_id=None, version=None):
        query = Datastore.query.filter(Datastore.id == object_id).first()

        if query is None:
            abort(404, 'No match for the provided id')

        user = User.query.filter(User.id == query.user_id).first()

        if user.domain != DOMAIN:
            abort(
                403,
                "You don't have permission to read the storage of an external user"
            )

        base_query = query

        if version is not None:
            try:
                query = query.versions[version]
            except IndexError:
                abort(404, "Version doesn't exist for provided id")
        else:
            version = query.versions.count() - 1

        data = json.loads(query.value.decode())
        original_author = base_query.versions[0].user
        total_versions = base_query.versions.count()
        version_author = query.user

        return {
            'id': query.id,
            'key': str(query.key.split(':', 1)[1]),
            'version': version,
            'total_versions': total_versions,
            'transcription': data,
            'alias': query.alias,
            'annotations_total': len(data),
            'timestamp': query.timestamp.isoformat(),
            'storage_spec': query.storage_spec,
            'author': {
                'original': {
                    'ats_id': original_author.id,
                    'domain': original_author.domain,
                    'remote_id': original_author.remote_id
                },
                'version': {
                    'ats_id': version_author.id,
                    'domain': version_author.domain,
                    'remote_id': version_author.remote_id
                }
            }
        }

    def _processor_post(self, key, value, storage_spec, alias=None):
        if key is None or len(key) < 2:
            abort(400, 'Key is invalid or too short')

        if alias is None or len(alias) < 1:
            alias = "default"

        validate_data(value)

        key = '%s:%s' % (DOMAIN, key)

        model = Datastore.query.filter(Datastore.key == key).filter(
            Datastore.alias == alias).filter(
                Datastore.user_id == g.user.id).first()

        data = json.dumps(value)

        if model is None:
            model = Datastore(key, data, storage_spec, g.user, alias)
            db.session.add(model)
        else:
            model.set_value(data)
            model.storage_spec = storage_spec
            model.timestamp = datetime.datetime.now()

        db.session.commit()

        return {
            'id': model.id,
            'version': model.versions.count() - 1,
            'timestamp': model.timestamp.isoformat(),
            'alias': alias
        }
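
`model.versions` and `query.versions[version]` point at SQLAlchemy-Continuum-style row versioning, where each commit of a changed row appends a new version. A sketch of the round trip through the two processors, with made-up values (this assumes an active request context where `g.user` is set):

route = AlveoStoreRoute()

created = route._processor_post(
    key='demo/item1/document/sample.wav',
    value=[{"start": 0.0, "end": 1.25, "speaker": "S1"}],
    storage_spec='asr-engine-gcloud/json/1.0')

# Reposting the same key/alias updates the row in place and appends a
# version, so the version counter climbs instead of creating a new id.
fetched = route._processor_get(object_id=created['id'])
assert fetched['version'] == created['version']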