Exemple #1
0
def run_tool(tool_name: str, wav_file: Path, aux_file: any, output_file: Path):
    """Run ``tools/<tool_name>/run.sh`` from the speech-tools tree on one file.

    The script is invoked through ``bash`` with a private temporary directory
    (created under ``tmp_dir``) that is removed again when the call finishes,
    whether it succeeded or not. Combined stdout/stderr of the script is
    written to ``<output_file>_log.txt``.

    Args:
        tool_name: Name of the sub-directory of ``speech_tools_path / 'tools'``
            that contains the ``run.sh`` to execute.
        wav_file: Input file passed to the script.
        aux_file: Optional extra positional argument; omitted from the command
            line when falsy.
        output_file: Path the script is expected to create.

    Raises:
        RuntimeError: If the script exits with a non-zero status, or if
            ``output_file`` does not exist after the script returned.
    """
    tmp_subdir = Path(mkdtemp(dir=tmp_dir))
    try:
        cmd = [
            'bash',
            str(speech_tools_path / 'tools' / tool_name / 'run.sh'),
            '--dist-path',
            str(speech_tools_path / 'dist'),
            '--tmp-path',
            str(tmp_subdir),
            str(wav_file),
        ]
        if aux_file:
            cmd.append(str(aux_file))
        cmd.append(str(output_file))

        with open(str(output_file) + '_log.txt', 'w') as log:
            logger.info(f'Running {" ".join(cmd)}')
            try:
                run(cmd,
                    stdout=log,
                    stderr=STDOUT,
                    check=True,
                    cwd=speech_tools_path)
            except CalledProcessError as err:
                # Keep the original failure (exit status, command) as the cause.
                raise RuntimeError(
                    f'error running script for {output_file}') from err
    finally:
        # The outer try also covers a failing open() of the log file, so the
        # temp dir never leaks. Cleanup is best-effort so that a problem while
        # deleting cannot replace the real error raised above.
        rmtree(str(tmp_subdir), ignore_errors=True)

    if not output_file.exists():
        raise RuntimeError(f'{output_file} missing')
Exemple #2
0
def package(work_dir: Path, project_id: str, db) -> Path:
    """Export a project from the database as a zipped ``*_emuDB`` directory.

    A temporary ``<random>_emuDB`` directory is created inside ``work_dir``
    and filled with:

    * ``<name>_DBconfig.json`` produced by ``get_config``;
    * one ``<session>_ses`` directory per session, containing one
      ``<bundle>_bndl`` directory per bundle with ``<bundle>.wav``,
      ``<bundle>_annot.json`` and whatever ``run_feat`` generates next to
      the wav file.

    Bundles without both an ``audio`` and a ``seg`` entry, or whose files
    cannot be fetched by ``get_file``, are skipped. The directory is then
    archived and removed.

    Args:
        work_dir: Directory in which the temporary tree and the archive are
            created.
        project_id: String form of the project's ``_id`` in ``db.clarin.emu``.
        db: Database handle; must expose ``clarin.emu.find_one`` and be
            accepted by ``get_file``.

    Returns:
        Path of the resulting ``.zip`` archive.

    Raises:
        RuntimeError: If the project does not exist or is marked as deleted.
    """
    proj = db.clarin.emu.find_one({'_id': ObjectId(project_id)})
    if not proj:
        raise RuntimeError('project not found')

    if 'deleted' in proj:
        raise RuntimeError('project deleted')

    suffix = '_emuDB'
    emu_dir = Path(mkdtemp(suffix=suffix, dir=work_dir))
    # The project name is the random directory name without the suffix.
    proj_name = emu_dir.name[:-len(suffix)]

    logger.info(f'Saving CTM in {emu_dir} (zip)...')

    config = get_config(proj_name, feats)
    with open(emu_dir / f'{proj_name}_DBconfig.json', 'w') as f:
        json.dump(config, f, indent=4)

    # Group the usable bundles by session name.
    sessions = {}
    # dict.iteritems() is Python 2 only; the bundle ids are not needed here.
    for bundle in proj['bundles'].values():
        if 'audio' not in bundle or 'seg' not in bundle:
            continue

        b = {
            'name': bundle['name'],
            'audio': get_file(db, bundle['audio'], work_dir),
            'ctm': get_file(db, bundle['seg'], work_dir),
        }

        if not b['audio'] or not b['ctm']:
            continue

        sessions.setdefault(bundle['session'], []).append(b)

    for sess, bndls in sessions.items():
        sess_dir = emu_dir / f'{sess}_ses'
        sess_dir.mkdir()
        # Everything below must run once per bundle, not once per session.
        for bndl in bndls:
            name = bndl['name']
            bndl_dir = sess_dir / f'{name}_bndl'
            bndl_dir.mkdir()

            # File names are built explicitly: Path.with_suffix() rejects
            # '_annot.json' (no leading dot) and would cut a bundle name
            # at its last dot when adding '.wav'.
            wav_path = bndl_dir / f'{name}.wav'
            annot_path = bndl_dir / f'{name}_annot.json'

            shutil.copy(bndl['audio'], wav_path)
            annot = segmentation_to_emu_annot(bndl['ctm'], name)
            with open(annot_path, 'w') as f:
                json.dump(annot, f, indent=4)
            run_feat(feats, wav_path)

    archive = emu_dir.with_suffix('.zip')
    # NOTE(review): called as make_archive(source_dir, zip_path), which does
    # not match shutil.make_archive(base_name, format, ...); presumably a
    # project helper -- confirm.
    make_archive(emu_dir, archive)
    shutil.rmtree(emu_dir)
    return archive
Exemple #3
0
def normalize(task: Dict[str, any]) -> Path:
    """Write a normalized copy of a text file into ``work_dir``.

    Every line of the input is lower-cased and then passed through the
    module-level patterns ``pat``, ``num`` and ``ws`` (in that order), each
    match being replaced by a single space. The result is stored in a new
    ``.txt`` file inside ``work_dir`` that is kept after the function returns.

    Args:
        task: Task description; ``task['input']`` is the input file path
            relative to ``work_dir``.

    Returns:
        Path of the output file, relative to ``work_dir``.
    """
    src = work_dir / task['input']
    # NamedTemporaryFile defaults to binary mode ('w+b'); text mode is needed
    # because str lines are written below.
    with NamedTemporaryFile('w', dir=work_dir, suffix='.txt',
                            delete=False) as fout:
        output = Path(fout.name)
        logger.info(f'Normalizing text file {src} -> {fout.name}')
        # `src` already includes work_dir; joining it again would break for a
        # relative work_dir.
        with open(str(src)) as fin:
            for line in fin:
                line = line.lower()
                line = pat.sub(' ', line)
                line = num.sub(' ', line)
                line = ws.sub(' ', line)
                fout.write(line)
    return output.relative_to(work_dir)
Exemple #4
0
def ffmpeg(task: Dict[str, any]) -> Path:
    """Convert an input file to a mono, 16 kHz, 16-bit PCM wav via ``ffmpeg``.

    The output goes to a uniquely named ``.wav`` file inside ``work_dir``;
    ffmpeg's combined stdout/stderr is written to ``<output>_ffmpeg.log``.

    Args:
        task: Task description; ``task['input']`` is the input file path
            relative to ``work_dir``.

    Returns:
        Path of the converted file, relative to ``work_dir``.

    Raises:
        RuntimeError: If ffmpeg cannot be started, exits with a non-zero
            status, or finishes without producing the output file.
    """
    src = work_dir / task['input']
    # Only the unique name is wanted here: the temporary file is deleted when
    # the `with` block ends and ffmpeg creates the real output afterwards.
    with NamedTemporaryFile(dir=work_dir, suffix='.wav') as f:
        tmp = Path(f.name)
    log_path = str(tmp) + '_ffmpeg.log'

    # Every element must be a str: the command is joined for logging below.
    # The original `dir / file` referred to the builtin dir() and could not work.
    cmd = [
        'ffmpeg', '-y', '-i', str(src),
        '-acodec', 'pcm_s16le', '-ac', '1', '-ar', '16k',
        str(tmp),
    ]
    logger.info(u'Running {}'.format(' '.join(cmd)))
    try:
        with open(log_path, 'w') as log:
            run(cmd, stdout=log, stderr=STDOUT, check=True)
    except Exception as err:
        # Not a bare `except:` -- KeyboardInterrupt/SystemExit must propagate.
        raise RuntimeError('error in call cmd -- check ' + log_path) from err

    if not tmp.exists():
        raise RuntimeError(
            'error in ffmpeg (no output file) -- check ' + log_path)
    return tmp.relative_to(work_dir)
Exemple #5
0
def run():
    """Poll the ``tasks`` collection forever and execute pending tasks.

    Once per second the oldest document (by ``time``) with
    ``in_progress == False`` and ``done == False`` is atomically claimed by
    setting ``in_progress`` to ``True``. The handler registered for its
    ``task`` field in ``tasks_map`` is called with the whole document, and the
    document is then updated with ``done=True``, ``in_progress=False`` and
    either ``result`` (``str()`` of the handler's return value) or ``error``.

    A handler failure never stops the worker: ``RuntimeError`` messages are
    stored as they are, any other exception is logged with its traceback and
    stored as ``'<ExceptionType>: <message>'``. Without this, an unexpected
    exception would leave the claimed task stuck with ``in_progress=True``.

    The single loop below is equivalent to the former nested loops, whose
    inner loop never exited. This function does not return.
    """
    # Function-scope import, executed once instead of on every loop pass.
    from pymongo import MongoClient

    db = MongoClient(host=db_host)[db_name]

    logger.info('Worker queue waiting...')

    while True:
        sleep(1)

        # Claim the oldest pending task. Listing both conditions in one
        # filter document is an implicit AND.
        task_data = db.tasks.find_one_and_update(
            filter={'in_progress': False, 'done': False},
            update={'$set': {'in_progress': True}},
            sort=[('time', ASCENDING)])

        if not task_data:
            continue

        task_type = task_data['task']
        logger.info(f'Performing {task_type}...')

        # Fields written back to the task document when processing ends.
        outcome = {'done': True, 'in_progress': False}
        if task_type in tasks_map:
            handler = tasks_map[task_type]
            try:
                outcome['result'] = str(handler(task_data))
            except RuntimeError as e:
                outcome['error'] = str(e)
            except Exception as e:
                logger.exception(f'Unexpected error in task {task_type}')
                outcome['error'] = f'{type(e).__name__}: {e}'
        else:
            logger.error(f'Unknown task: {task_type}')
            outcome['error'] = f'Unknown task: {task_type}'

        db.tasks.update_one({'_id': ObjectId(task_data['_id'])},
                            {'$set': outcome})