Code example #1 (score: 0)
File: mode_search.py — Project: akosiaris/apertium-apy
def search_path(rootpath, include_pairs=True, verbosity=1):
    """Walk *rootpath* for Apertium ``.mode`` files and classify them by type.

    Args:
        rootpath: directory tree to search (symlinks are followed; loops are
            pruned via ``is_loop``).
        include_pairs: when False, translation-pair modes are skipped.
        verbosity: when > 1, the discovered modes are logged.

    Returns:
        Dict mapping each mode type ('pair', 'analyzer', 'generator',
        'tagger', 'spell', 'tokenise') to a list of 3-tuples: for 'pair',
        (mode file path, src lang, trg lang); otherwise
        (modes directory, mode name, language pair).
    """
    lang_code = r'[a-z]{2,3}(?:_[A-Za-z0-9]+)?'
    type_re = {
        'pair':
        re.compile(r'({0})-({0})\.mode'.format(lang_code)),
        'analyzer':
        re.compile(r'(({0}(-{0})?)-(an)?mor(ph)?)\.mode'.format(lang_code)),
        'generator':
        # BUG FIX: '[A-z]' also matches '[', '\\', ']', '^', '_' and '`'
        # (the ASCII range between 'Z' and 'a'); restrict to letters only.
        re.compile(r'(({0}(-{0})?)-gener[A-Za-z]*)\.mode'.format(lang_code)),
        'tagger':
        re.compile(r'(({0}(-{0})?)-tagger)\.mode'.format(lang_code)),
        'spell':
        re.compile(r'(({0}(-{0})?)-spell)\.mode'.format(lang_code)),
        'tokenise':
        re.compile(r'(({0}(-{0})?)-tokenise)\.mode'.format(lang_code)),
    }
    # One result bucket per mode type, derived from type_re so the two
    # structures can never drift out of sync.
    modes = {mtype: [] for mtype in type_re}  # type: Dict[str, List[Tuple[str, str, str]]]

    real_root = os.path.abspath(os.path.realpath(rootpath))

    for dirpath, dirnames, files in os.walk(rootpath, followlinks=True):
        if is_loop(dirpath, rootpath, real_root):
            dirnames[:] = []  # prune: don't descend into a symlink loop
            continue
        for filename in [f for f in files if f.endswith('.mode')]:
            for mtype, regex in type_re.items():
                m = regex.match(filename)
                if m:
                    if mtype != 'pair':
                        modename = m.group(1)  # e.g. en-es-anmorph
                        langlist = [
                            to_alpha3_code(x) for x in m.group(2).split('-')
                        ]
                        lang_pair = '-'.join(langlist)  # e.g. en-es
                        dir_of_modes = os.path.dirname(dirpath)
                        mode = (dir_of_modes, modename, lang_pair)
                        modes[mtype].append(mode)
                    elif include_pairs:
                        lang_src = m.group(1)
                        lang_trg = m.group(2)
                        mode = (os.path.join(dirpath, filename),
                                to_alpha3_code(lang_src),
                                to_alpha3_code(lang_trg))
                        modes[mtype].append(mode)

    if verbosity > 1:
        _log_modes(modes)

    return modes
def search_path(rootpath, include_pairs=True, verbosity=1):
    """Walk *rootpath* for Apertium ``.mode`` files and classify them by type.

    Args:
        rootpath: directory tree to search (symlinks are followed; loops are
            pruned via ``is_loop``).
        include_pairs: when False, translation-pair modes are skipped.
        verbosity: when > 1, the discovered modes are logged.

    Returns:
        Dict mapping each mode type ('pair', 'analyzer', 'generator',
        'tagger', 'spell', 'tokenise') to a list of 3-tuples: for 'pair',
        (mode file path, src lang, trg lang); otherwise
        (modes directory, mode name, language pair).
    """
    lang_code = r'[a-z]{2,3}(?:_[A-Za-z]+)?'
    type_re = {
        'pair': re.compile(r'({0})-({0})\.mode'.format(lang_code)),
        'analyzer': re.compile(r'(({0}(-{0})?)-(an)?mor(ph)?)\.mode'.format(lang_code)),
        # BUG FIX: '[A-z]' also matches '[', '\\', ']', '^', '_' and '`'
        # (the ASCII range between 'Z' and 'a'); restrict to letters only.
        'generator': re.compile(r'(({0}(-{0})?)-gener[A-Za-z]*)\.mode'.format(lang_code)),
        'tagger': re.compile(r'(({0}(-{0})?)-tagger)\.mode'.format(lang_code)),
        'spell': re.compile(r'(({0}(-{0})?)-spell)\.mode'.format(lang_code)),
        'tokenise': re.compile(r'(({0}(-{0})?)-tokenise)\.mode'.format(lang_code)),
    }
    # One result bucket per mode type, derived from type_re so the two
    # structures can never drift out of sync.
    modes = {mtype: [] for mtype in type_re}  # type: Dict[str, List[Tuple[str, str, str]]]

    real_root = os.path.abspath(os.path.realpath(rootpath))

    for dirpath, dirnames, files in os.walk(rootpath, followlinks=True):
        if is_loop(dirpath, rootpath, real_root):
            dirnames[:] = []  # prune: don't descend into a symlink loop
            continue
        for filename in [f for f in files if f.endswith('.mode')]:
            for mtype, regex in type_re.items():
                m = regex.match(filename)
                if m:
                    if mtype != 'pair':
                        modename = m.group(1)  # e.g. en-es-anmorph
                        langlist = [to_alpha3_code(l) for l in m.group(2).split('-')]
                        lang_pair = '-'.join(langlist)  # e.g. en-es
                        dir_of_modes = os.path.dirname(dirpath)
                        mode = (dir_of_modes,
                                modename,
                                lang_pair)
                        modes[mtype].append(mode)
                    elif include_pairs:
                        lang_src = m.group(1)
                        lang_trg = m.group(2)
                        mode = (os.path.join(dirpath, filename),
                                to_alpha3_code(lang_src),
                                to_alpha3_code(lang_trg))
                        modes[mtype].append(mode)

    if verbosity > 1:
        _log_modes(modes)

    return modes
Code example #3 (score: 0)
    def get(self):
        """Identify the language of the ``q`` argument.

        Uses CLD2 when available; otherwise falls back to ranking the
        installed analysers by coverage (with a timeout).
        """
        text = self.get_argument('q')
        if not text:
            return self.send_error(400, explanation='Missing q argument')

        if cld2:
            detection = cld2.detect(text)
            if detection[0]:
                # Map each reliable guess (skipping 'un' = unknown) from its
                # CLD2 language code to an alpha-3 code with its score.
                guesses = {
                    to_alpha3_code(detail[1]): detail[2]
                    for detail in detection[2]
                    if detail[1] != 'un'
                }
                self.send_response(guesses)
            else:
                self.send_response({'nob': 100})  # TODO: Some more reasonable response
        else:
            try:
                coverages = yield gen.with_timeout(
                    timedelta(seconds=self.timeout),
                    get_coverages(text, self.analyzers, penalize=True),
                )
            except gen.TimeoutError:
                self.send_error(408, explanation='Request timed out')
            else:
                self.send_response(coverages)
Code example #4 (score: 0)
 def get_pairs_or_error(self, langpairs, text_length):
     """Parse a '|'-separated language list and resolve it to a pair/path.

     Args:
         langpairs: e.g. 'eng|spa' or 'eng|spa|fra' (chained translation).
         text_length: length of the text being translated, for logging.

     Returns:
         For exactly two distinct languages, the translation path from
         self.paths (or None if uninstalled); for longer chains, the list
         of alpha-3 codes; None after sending a 400 error otherwise.
     """

     def _fail(explanation):
         # Single exit path for all error cases: the original repeated this
         # send_error/log/return sequence three times.
         self.send_error(400, explanation=explanation)
         self.log_after_translation(self.log_before_translation(),
                                    text_length)
         return None

     langs = [to_alpha3_code(lang) for lang in langpairs.split('|')]
     if len(langs) < 2:
         return _fail('Need at least two languages, use e.g. eng|spa')
     if len(langs) == 2:
         if langs[0] == langs[1]:
             return _fail('Need at least two languages, use e.g. eng|spa')
         return self.paths.get(langs[0], {}).get(langs[1])
     for lang1, lang2 in self.pair_list(langs):
         if '{:s}-{:s}'.format(lang1, lang2) not in self.pairs:
             return _fail('Pair {:s}-{:s} is not installed'.format(
                 lang1, lang2))
     return langs
Code example #5 (score: 0)
File: speller.py — Project: kartikm/apertium-apy
    def get(self):
        """Spell-check the ``q`` argument with the speller for ``lang``.

        Responds with a list of ``{'token', 'known', 'sugg'}`` dicts — one
        per token, with suggestions for unknown tokens — or 404 when no
        speller is installed for the requested language.
        """
        in_text = self.get_argument('q') + '*'  # presumably terminates the tokeniser input — TODO confirm
        in_mode = to_alpha3_code(self.get_argument('lang'))
        if '-' in in_mode:
            # Pair-style code: normalise both halves to alpha-3.
            l1, l2 = map(to_alpha3_code, in_mode.split('-', 1))
            in_mode = '%s-%s' % (l1, l2)
        in_mode = self.find_fallback_mode(in_mode, self.spellers)
        logging.info(in_text)
        logging.info(self.get_argument('lang'))
        logging.info(in_mode)
        logging.info(self.spellers)
        if in_mode in self.spellers:
            logging.info(self.spellers[in_mode])
            [path, mode] = self.spellers[in_mode]
            logging.info(path)
            logging.info(mode)
            formatting = 'none'
            # First pass: tokenise the input text.
            commands = [[
                'apertium', '-d', path, '-f', formatting,
                self.get_argument('lang') + '-tokenise'
            ]]
            result = yield translate_simple(in_text, commands)

            tokens = streamparser.parse(result)
            units = []
            for token in tokens:
                if token.knownness == streamparser.known:
                    units.append({
                        'token': token.wordform,
                        'known': True,
                        'sugg': []
                    })
                else:
                    # Second pass: run the speller mode on the unknown token.
                    suggestion = []
                    commands = [[
                        'apertium', '-d', path, '-f', formatting, mode
                    ]]

                    result = yield translate_simple(token.wordform, commands)
                    found_sugg = False
                    for line in result.splitlines():
                        # Suggestions follow a 'Corrections for ...' header,
                        # one tab-separated entry per line.
                        # IDIOM FIX: membership test, not str.count().
                        if 'Corrections for' in line:
                            found_sugg = True
                            continue
                        if found_sugg and '\t' in line:
                            s, w = line.split('\t')
                            suggestion.append((s, w))

                    units.append({
                        'token': token.wordform,
                        'known': False,
                        'sugg': suggestion
                    })

            self.send_response(units)
        else:
            error_explanation = '{} on spellchecker mode: {}'.format(
                'Error 404',
                'Spelling mode for ' + in_mode + ' is not installed')
            self.send_error(404, explanation=error_explanation)
Code example #6 (score: 0)
    async def get(self):
        """Run per-word analysis modes on ``q`` and respond with the results.

        Accepts a space-separated ``modes`` argument (subset of morph,
        biltrans, tagger, disambig, translate) and an optional 1-based
        ``pos`` argument to return only the unit covering that word.
        """
        lang = to_alpha3_code(self.get_argument('lang'))
        modes = set(self.get_argument('modes').split(' '))
        query = self.get_argument('q')

        if not modes <= {
                'morph', 'biltrans', 'tagger', 'disambig', 'translate'
        }:
            self.send_error(400, explanation='Invalid mode argument')
            return

        def handle_output(output):
            # (Removed a block of dead alternative implementations that was
            # being kept alive as a docstring.)
            # None: processing produced nothing; other falsy value: presumably
            # a timeout signal from process_per_word — TODO confirm.
            if output is None:
                self.send_error(400, explanation='No output')
                return
            elif not output:
                self.send_error(408, explanation='Request timed out')
                return
            outputs, tagger_lexical_units, morph_lexical_units = output

            # Prefer the tagger's segmentation; fall back to the analyser's.
            lexical_units = (tagger_lexical_units if tagger_lexical_units
                             else morph_lexical_units)
            to_return = []
            for index, lexical_unit in enumerate(lexical_units):
                unit_to_return = {
                    'input': strip_tags(lexical_unit.split('/')[0]),
                }
                for mode in modes:
                    unit_to_return[mode] = outputs[mode][index]
                to_return.append(unit_to_return)

            if self.get_argument('pos', default=None):
                requested_pos = int(self.get_argument('pos')) - 1
                current_pos = 0
                for unit in to_return:
                    # 'input' was shadowing the builtin; count its words to
                    # find which unit covers the requested position.
                    current_pos += len(unit['input'].split(' '))
                    if requested_pos < current_pos:
                        self.send_response(unit)
                        return
                # NOTE(review): a pos past the last word sends no response —
                # preserved from the original; confirm this is intended.
            else:
                self.send_response(to_return)

        output = await process_per_word(self.analyzers, self.taggers, lang,
                                        modes, query)
        handle_output(output)
Code example #7 (score: 0)
 def get(self):
     """Morphologically analyse the ``q`` argument with the ``lang`` analyser."""
     query = self.get_argument('q')
     lang_key = to_alpha3_code(self.get_argument('lang'))
     if lang_key not in self.analyzers:
         self.send_error(400, explanation='That mode is not installed')
         return
     directory, mode_name = self.analyzers[lang_key]
     pipeline = [['apertium', '-d', directory, '-f', 'txt', mode_name]]
     raw = yield translate_simple(query, pipeline)
     self.send_response(self.postproc_text(query, raw))
 def get(self):
     """Analyse the ``q`` argument using the analyser selected by ``lang``."""
     source_text = self.get_argument('q')
     mode_key = to_alpha3_code(self.get_argument('lang'))
     if mode_key in self.analyzers:
         mode_dir, mode_name = self.analyzers[mode_key]
         cmds = [['apertium', '-d', mode_dir, '-f', 'txt', mode_name]]
         analysed = yield translate_simple(source_text, cmds)
         self.send_response(self.postproc_text(source_text, analysed))
     else:
         self.send_error(400, explanation='That mode is not installed')
Code example #9 (score: 0)
 def get(self):
     """Generate surface forms for the lexical units in the ``q`` argument."""
     query = self.get_argument('q')
     lang_key = to_alpha3_code(self.get_argument('lang'))
     if lang_key not in self.generators:
         self.send_error(400, explanation='That mode is not installed')
         return
     directory, mode_name = self.generators[lang_key]
     pipeline = [['apertium', '-d', directory, '-f', 'none', mode_name]]
     lexical_units, to_generate = self.preproc_text(query)
     raw = yield translate_simple(to_generate, pipeline)
     self.send_response(self.postproc_text(lexical_units, raw))
Code example #10 (score: 0)
    def get(self):
        """Respond with the analyser coverage of ``q`` for ``lang`` (with timeout)."""
        mode = to_alpha3_code(self.get_argument('lang'))
        text = self.get_argument('q')
        if not text:
            self.send_error(400, explanation='Missing q argument')
            return
        if mode not in self.analyzers:
            self.send_error(400, explanation='That mode is not installed')
            return

        analyzer_path, analyzer_mode = self.analyzers[mode]
        try:
            coverage = yield gen.with_timeout(
                timedelta(seconds=self.timeout),
                get_coverage(text, analyzer_path, analyzer_mode),
            )
        except gen.TimeoutError:
            self.send_error(408, explanation='Request timed out')
        else:
            self.send_response([coverage])
    def get(self):
        """Guess the language of ``q`` via CLD2, else rank analysers by coverage."""
        text = self.get_argument('q')
        if not text:
            return self.send_error(400, explanation='Missing q argument')

        if not cld2:
            # No CLD2 binding available: fall back to coverage ranking.
            try:
                coverages = yield gen.with_timeout(
                    timedelta(seconds=self.timeout),
                    get_coverages(text, self.analyzers, penalize=True),
                )
            except gen.TimeoutError:
                self.send_error(408, explanation='Request timed out')
            else:
                self.send_response(coverages)
            return

        detection = cld2.detect(text)
        if not detection[0]:
            self.send_response({'nob': 100})  # TODO: Some more reasonable response
            return
        guesses = {}
        for detail in detection[2]:
            if detail[1] != 'un':  # skip 'unknown' guesses
                guesses[to_alpha3_code(detail[1])] = detail[2]
        self.send_response(guesses)
    def get(self):
        """Run per-word modes on ``q``; work runs in a killable worker pool.

        Accepts a space-separated ``modes`` argument (subset of morph,
        biltrans, tagger, disambig, translate) and an optional 1-based
        ``pos`` argument to return only the unit covering that word.
        The single-process pool lets us terminate the worker on timeout.
        """
        lang = to_alpha3_code(self.get_argument('lang'))
        modes = set(self.get_argument('modes').split(' '))
        query = self.get_argument('q')

        if not modes <= {'morph', 'biltrans', 'tagger', 'disambig', 'translate'}:
            self.send_error(400, explanation='Invalid mode argument')
            return

        def handle_output(output):
            # (Removed a block of dead alternative implementations that was
            # being kept alive as a docstring.)
            # None is what worker() delivers after terminating the pool on
            # timeout; other falsy values presumably signal an upstream
            # timeout — TODO confirm the 400-vs-408 mapping is intended.
            if output is None:
                self.send_error(400, explanation='No output')
                return
            elif not output:
                self.send_error(408, explanation='Request timed out')
                return
            outputs, tagger_lexical_units, morph_lexical_units = output

            # Prefer the tagger's segmentation; fall back to the analyser's.
            lexical_units = (tagger_lexical_units if tagger_lexical_units
                             else morph_lexical_units)
            to_return = []
            for index, lexical_unit in enumerate(lexical_units):
                unit_to_return = {
                    'input': strip_tags(lexical_unit.split('/')[0]),
                }
                for mode in modes:
                    unit_to_return[mode] = outputs[mode][index]
                to_return.append(unit_to_return)

            if self.get_argument('pos', default=None):
                requested_pos = int(self.get_argument('pos')) - 1
                current_pos = 0
                for unit in to_return:
                    # 'input' was shadowing the builtin; count its words to
                    # find which unit covers the requested position.
                    current_pos += len(unit['input'].split(' '))
                    if requested_pos < current_pos:
                        self.send_response(unit)
                        return
                # NOTE(review): a pos past the last word sends no response —
                # preserved from the original; confirm this is intended.
            else:
                self.send_response(to_return)

        pool = Pool(processes=1)
        result = pool.apply_async(process_per_word, (self.analyzers, self.taggers, lang, modes, query))
        pool.close()

        @run_async_thread
        def worker(callback):
            try:
                callback(result.get(timeout=self.timeout))
            except TimeoutError:
                # Worker exceeded the deadline: kill it and report failure.
                pool.terminate()
                callback(None)

        output = yield gen.Task(worker)
        handle_output(output)
Code example #13 (score: 0)
File: per_word.py — Project: shardulc/apertium-apy
    def get(self):
        """Run per-word modes on ``q``; work runs in a killable worker pool.

        Accepts a space-separated ``modes`` argument (subset of morph,
        biltrans, tagger, disambig, translate) and an optional 1-based
        ``pos`` argument to return only the unit covering that word.
        The single-process pool lets us terminate the worker on timeout.
        """
        lang = to_alpha3_code(self.get_argument('lang'))
        modes = set(self.get_argument('modes').split(' '))
        query = self.get_argument('q')

        if not modes <= {
                'morph', 'biltrans', 'tagger', 'disambig', 'translate'
        }:
            self.send_error(400, explanation='Invalid mode argument')
            return

        def handle_output(output):
            # (Removed a block of dead alternative implementations that was
            # being kept alive as a docstring.)
            # None is what worker() delivers after terminating the pool on
            # timeout; other falsy values presumably signal an upstream
            # timeout — TODO confirm the 400-vs-408 mapping is intended.
            if output is None:
                self.send_error(400, explanation='No output')
                return
            elif not output:
                self.send_error(408, explanation='Request timed out')
                return
            outputs, tagger_lexical_units, morph_lexical_units = output

            # Prefer the tagger's segmentation; fall back to the analyser's.
            lexical_units = (tagger_lexical_units if tagger_lexical_units
                             else morph_lexical_units)
            to_return = []
            for index, lexical_unit in enumerate(lexical_units):
                unit_to_return = {
                    'input': strip_tags(lexical_unit.split('/')[0]),
                }
                for mode in modes:
                    unit_to_return[mode] = outputs[mode][index]
                to_return.append(unit_to_return)

            if self.get_argument('pos', default=None):
                requested_pos = int(self.get_argument('pos')) - 1
                current_pos = 0
                for unit in to_return:
                    # 'input' was shadowing the builtin; count its words to
                    # find which unit covers the requested position.
                    current_pos += len(unit['input'].split(' '))
                    if requested_pos < current_pos:
                        self.send_response(unit)
                        return
                # NOTE(review): a pos past the last word sends no response —
                # preserved from the original; confirm this is intended.
            else:
                self.send_response(to_return)

        pool = Pool(processes=1)
        result = pool.apply_async(
            process_per_word,
            (self.analyzers, self.taggers, lang, modes, query))
        pool.close()

        @run_async_thread
        def worker(callback):
            try:
                callback(result.get(timeout=self.timeout))
            except TimeoutError:
                # Worker exceeded the deadline: kill it and report failure.
                pool.terminate()
                callback(None)

        output = yield gen.Task(worker)
        handle_output(output)