def search_path(rootpath, include_pairs=True, verbosity=1):
    """Walk ``rootpath`` and collect Apertium ``.mode`` files by type.

    Args:
        rootpath: directory tree to search (symlinks are followed, with
            loop protection via is_loop()).
        include_pairs: if True, also collect translation-pair modes.
        verbosity: if > 1, log the discovered modes via _log_modes().

    Returns:
        Dict mapping each mode type ('pair', 'analyzer', 'generator',
        'tagger', 'spell', 'tokenise') to a list of tuples:
        for 'pair': (path-to-mode-file, src_lang, trg_lang);
        for the rest: (modes-parent-dir, modename, lang_pair).
    """
    lang_code = r'[a-z]{2,3}(?:_[A-Za-z0-9]+)?'
    type_re = {
        'pair': re.compile(r'({0})-({0})\.mode'.format(lang_code)),
        'analyzer': re.compile(r'(({0}(-{0})?)-(an)?mor(ph)?)\.mode'.format(lang_code)),
        # BUG FIX: the original pattern used [A-z], which also matches the
        # ASCII characters between 'Z' and 'a' ('[', '\', ']', '^', '_', '`').
        # [A-Za-z] matches letters only, as intended.
        'generator': re.compile(r'(({0}(-{0})?)-gener[A-Za-z]*)\.mode'.format(lang_code)),
        'tagger': re.compile(r'(({0}(-{0})?)-tagger)\.mode'.format(lang_code)),
        'spell': re.compile(r'(({0}(-{0})?)-spell)\.mode'.format(lang_code)),
        'tokenise': re.compile(r'(({0}(-{0})?)-tokenise)\.mode'.format(lang_code)),
    }
    modes = {
        'pair': [],
        'analyzer': [],
        'generator': [],
        'tagger': [],
        'spell': [],
        'tokenise': [],
    }  # type: Dict[str, List[Tuple[str, str, str]]]
    real_root = os.path.abspath(os.path.realpath(rootpath))
    for dirpath, dirnames, files in os.walk(rootpath, followlinks=True):
        # Prune directories that would make followlinks=True recurse forever.
        if is_loop(dirpath, rootpath, real_root):
            dirnames[:] = []
            continue
        for filename in [f for f in files if f.endswith('.mode')]:
            for mtype, regex in type_re.items():
                m = regex.match(filename)
                if m:
                    if mtype != 'pair':
                        modename = m.group(1)  # e.g. en-es-anmorph
                        langlist = [to_alpha3_code(x) for x in m.group(2).split('-')]
                        lang_pair = '-'.join(langlist)  # e.g. en-es
                        # Modes live in <pkg>/modes/, so the package dir is
                        # one level up from the .mode file's directory.
                        dir_of_modes = os.path.dirname(dirpath)
                        mode = (dir_of_modes, modename, lang_pair)
                        modes[mtype].append(mode)
                    elif include_pairs:
                        lang_src = m.group(1)
                        lang_trg = m.group(2)
                        mode = (os.path.join(dirpath, filename),
                                to_alpha3_code(lang_src),
                                to_alpha3_code(lang_trg))
                        modes[mtype].append(mode)
    if verbosity > 1:
        _log_modes(modes)
    return modes
def search_path(rootpath, include_pairs=True, verbosity=1):
    """Scan ``rootpath`` for Apertium ``.mode`` files, grouped by mode type.

    Args:
        rootpath: root of the directory tree to walk (follows symlinks;
            is_loop() guards against cycles).
        include_pairs: whether translation pairs are collected too.
        verbosity: when greater than 1, the result is logged via _log_modes().

    Returns:
        Dict keyed by mode type; 'pair' entries are
        (mode-file-path, src_lang, trg_lang) tuples, the other types are
        (modes-parent-dir, modename, lang_pair) tuples.
    """
    lang_code = r'[a-z]{2,3}(?:_[A-Za-z]+)?'
    type_re = {
        'pair': re.compile(r'({0})-({0})\.mode'.format(lang_code)),
        'analyzer': re.compile(r'(({0}(-{0})?)-(an)?mor(ph)?)\.mode'.format(lang_code)),
        # BUG FIX: [A-z] is not "letters" — it also covers '[', '\', ']',
        # '^', '_' and '`' (ASCII 91-96). Use [A-Za-z] instead.
        'generator': re.compile(r'(({0}(-{0})?)-gener[A-Za-z]*)\.mode'.format(lang_code)),
        'tagger': re.compile(r'(({0}(-{0})?)-tagger)\.mode'.format(lang_code)),
        'spell': re.compile(r'(({0}(-{0})?)-spell)\.mode'.format(lang_code)),
        'tokenise': re.compile(r'(({0}(-{0})?)-tokenise)\.mode'.format(lang_code)),
    }
    modes = {
        'pair': [],
        'analyzer': [],
        'generator': [],
        'tagger': [],
        'spell': [],
        'tokenise': [],
    }  # type: Dict[str, List[Tuple[str, str, str]]]
    real_root = os.path.abspath(os.path.realpath(rootpath))
    for dirpath, dirnames, files in os.walk(rootpath, followlinks=True):
        # Stop descending into symlink loops (followlinks=True is unsafe
        # without this check).
        if is_loop(dirpath, rootpath, real_root):
            dirnames[:] = []
            continue
        for filename in [f for f in files if f.endswith('.mode')]:
            for mtype, regex in type_re.items():
                m = regex.match(filename)
                if m:
                    if mtype != 'pair':
                        modename = m.group(1)  # e.g. en-es-anmorph
                        langlist = [to_alpha3_code(l) for l in m.group(2).split('-')]
                        lang_pair = '-'.join(langlist)  # e.g. en-es
                        dir_of_modes = os.path.dirname(dirpath)
                        mode = (dir_of_modes, modename, lang_pair)
                        modes[mtype].append(mode)
                    elif include_pairs:
                        lang_src = m.group(1)
                        lang_trg = m.group(2)
                        mode = (os.path.join(dirpath, filename),
                                to_alpha3_code(lang_src),
                                to_alpha3_code(lang_trg))
                        modes[mtype].append(mode)
    if verbosity > 1:
        _log_modes(modes)
    return modes
def get(self):
    """Identify the language of ?q=..., preferring cld2 when available.

    With cld2: responds with {alpha3_code: confidence} for each reliable
    guess, or a placeholder when detection is unreliable. Without cld2:
    falls back to analyser-coverage-based detection (may time out with 408).
    """
    text = self.get_argument('q')
    if not text:
        return self.send_error(400, explanation='Missing q argument')
    if cld2:
        detection = cld2.detect(text)
        if detection[0]:  # detection[0] == is_reliable
            # detection[2] holds per-language guesses; drop 'un' (unknown).
            self.send_response({
                to_alpha3_code(guess[1]): guess[2]
                for guess in detection[2]
                if guess[1] != 'un'
            })
        else:
            self.send_response({'nob': 100})  # TODO: Some more reasonable response
    else:
        # No cld2 installed: estimate via analyser coverage, with a timeout.
        try:
            coverages = yield gen.with_timeout(
                timedelta(seconds=self.timeout),
                get_coverages(text, self.analyzers, penalize=True),
            )
            self.send_response(coverages)
        except gen.TimeoutError:
            self.send_error(408, explanation='Request timed out')
def get_pairs_or_error(self, langpairs, text_length):
    """Validate a '|'-separated language chain and resolve it.

    Returns the installed pair path for a two-language request, the
    alpha-3 language list for a multi-hop request, or None after sending
    a 400 error (and logging the aborted translation) on invalid input.
    """
    def reject(message):
        # Send the 400 response and still record the request in the
        # translation log, matching the success-path bookkeeping.
        self.send_error(400, explanation=message)
        self.log_after_translation(self.log_before_translation(), text_length)

    langs = [to_alpha3_code(code) for code in langpairs.split('|')]
    # Fewer than two languages, or a trivial x->x pair, is not translatable.
    if len(langs) < 2 or (len(langs) == 2 and langs[0] == langs[1]):
        reject('Need at least two languages, use e.g. eng|spa')
        return None
    if len(langs) == 2:
        # Direct pair: look up its path (None if not installed).
        return self.paths.get(langs[0], {}).get(langs[1])
    # Multi-hop chain: every consecutive pair must be installed.
    for src, trg in self.pair_list(langs):
        if '{:s}-{:s}'.format(src, trg) not in self.pairs:
            reject('Pair {:s}-{:s} is not installed'.format(src, trg))
            return None
    return langs
def get(self):
    """Spell-check ?q=... with the speller for ?lang=...

    Tokenises the input with the pair's tokeniser mode, then runs each
    unknown token through the spell mode and parses its 'Corrections for'
    output into suggestions. Responds with a list of
    {'token': ..., 'known': bool, 'sugg': [(suggestion, weight), ...]}.
    Sends 404 when no spelling mode is installed for the language.
    """
    # Trailing '*' marks the final token so the tokeniser flushes it.
    in_text = self.get_argument('q') + '*'
    in_mode = to_alpha3_code(self.get_argument('lang'))
    if '-' in in_mode:
        # Normalise both sides of a pair code (e.g. en-es -> eng-spa).
        l1, l2 = map(to_alpha3_code, in_mode.split('-', 1))
        in_mode = '%s-%s' % (l1, l2)
    in_mode = self.find_fallback_mode(in_mode, self.spellers)
    # Debug logging of the resolved request.
    logging.info(in_text)
    logging.info(self.get_argument('lang'))
    logging.info(in_mode)
    logging.info(self.spellers)
    if in_mode in self.spellers:
        logging.info(self.spellers[in_mode])
        [path, mode] = self.spellers[in_mode]
        logging.info(path)
        logging.info(mode)
        formatting = 'none'
        # First pass: tokenise the raw input with the language's tokeniser.
        commands = [[
            'apertium', '-d', path, '-f', formatting,
            self.get_argument('lang') + '-tokenise'
        ]]
        result = yield translate_simple(in_text, commands)
        tokens = streamparser.parse(result)
        units = []
        for token in tokens:
            if token.knownness == streamparser.known:
                # Known word: no suggestions needed.
                units.append({
                    'token': token.wordform,
                    'known': True,
                    'sugg': []
                })
            else:
                # Unknown word: run it through the spell mode for suggestions.
                suggestion = []
                commands = [[
                    'apertium', '-d', path, '-f', formatting, mode
                ]]
                result = yield translate_simple(token.wordform, commands)
                # Suggestions appear as tab-separated lines after a
                # 'Corrections for' header in the speller output.
                found_sugg = False
                for line in result.splitlines():
                    if line.count('Corrections for'):
                        found_sugg = True
                        continue
                    if found_sugg and '\t' in line:
                        s, w = line.split('\t')
                        suggestion.append((s, w))
                units.append({
                    'token': token.wordform,
                    'known': False,
                    'sugg': suggestion
                })
        self.send_response(units)
    else:
        error_explanation = '{} on spellchecker mode: {}'.format(
            'Error 404',
            'Spelling mode for ' + in_mode + ' is not installed')
        self.send_error(404, explanation=error_explanation)
async def get(self):
    """Run per-word operations (?modes=..., space-separated) on ?q=...

    Responds with a list of {'input': <surface form>, <mode>: <output>, ...}
    dicts, one per lexical unit; when ?pos=N (1-based word position) is
    given, responds with only the unit covering that position.
    """
    lang = to_alpha3_code(self.get_argument('lang'))
    modes = set(self.get_argument('modes').split(' '))
    query = self.get_argument('q')
    if not modes <= {'morph', 'biltrans', 'tagger', 'disambig', 'translate'}:
        self.send_error(400, explanation='Invalid mode argument')
        return

    def handle_output(output):
        """Format the result of process_per_word() and send the response.

        output is None for "no output" (400), falsy-but-not-None on
        timeout (408), otherwise a tuple
        (outputs, tagger_lexical_units, morph_lexical_units).
        """
        # NOTE: a large commented-out block of alternative response
        # formats (dead code) was removed from here.
        if output is None:
            self.send_error(400, explanation='No output')
            return
        elif not output:
            self.send_error(408, explanation='Request timed out')
            return
        outputs, tagger_lexical_units, morph_lexical_units = output
        # Prefer the tagger's segmentation; fall back to the analyser's.
        lexical_units = tagger_lexical_units if tagger_lexical_units else morph_lexical_units
        to_return = []
        for index, lexical_unit in enumerate(lexical_units):
            unit_result = {'input': strip_tags(lexical_unit.split('/')[0])}
            for mode in modes:
                unit_result[mode] = outputs[mode][index]
            to_return.append(unit_result)
        if self.get_argument('pos', default=None):
            requested_pos = int(self.get_argument('pos')) - 1
            current_pos = 0
            for unit in to_return:
                # Renamed from `input` to avoid shadowing the builtin.
                unit_input = unit['input']
                current_pos += len(unit_input.split(' '))
                if requested_pos < current_pos:
                    self.send_response(unit)
                    return
            # NOTE(review): when ?pos= exceeds the text length, no response
            # is sent — this mirrors the original behavior; confirm intent.
        else:
            self.send_response(to_return)

    output = await process_per_word(self.analyzers, self.taggers, lang,
                                    modes, query)
    handle_output(output)
def get(self):
    """Morphologically analyse ?q=... with the analyser for ?lang=...

    Sends 400 when no analyser mode is installed for the language.
    """
    query = self.get_argument('q')
    mode_name = to_alpha3_code(self.get_argument('lang'))
    if mode_name not in self.analyzers:
        self.send_error(400, explanation='That mode is not installed')
        return
    path, mode = self.analyzers[mode_name]
    # 'txt' formatting: treat the query as plain text.
    commands = [['apertium', '-d', path, '-f', 'txt', mode]]
    raw_output = yield translate_simple(query, commands)
    self.send_response(self.postproc_text(query, raw_output))
def get(self):
    """Generate surface forms for the lexical units in ?q=...

    Sends 400 when no generator mode is installed for ?lang=...
    """
    query = self.get_argument('q')
    mode_name = to_alpha3_code(self.get_argument('lang'))
    if mode_name not in self.generators:
        self.send_error(400, explanation='That mode is not installed')
        return
    path, mode = self.generators[mode_name]
    # Split the raw query into lexical units and the stream to generate.
    lexical_units, to_generate = self.preproc_text(query)
    commands = [['apertium', '-d', path, '-f', 'none', mode]]
    raw_output = yield translate_simple(to_generate, commands)
    self.send_response(self.postproc_text(lexical_units, raw_output))
def get(self):
    """Report analyser coverage of ?q=... for the language ?lang=...

    Responds with a one-element list [coverage]; 400 for a missing query
    or uninstalled mode, 408 when the computation times out.
    """
    mode = to_alpha3_code(self.get_argument('lang'))
    text = self.get_argument('q')
    if not text:
        self.send_error(400, explanation='Missing q argument')
        return
    if mode not in self.analyzers:
        self.send_error(400, explanation='That mode is not installed')
        return
    path, analyser = self.analyzers[mode]
    try:
        coverage = yield gen.with_timeout(
            timedelta(seconds=self.timeout),
            get_coverage(text, path, analyser),
        )
        self.send_response([coverage])
    except gen.TimeoutError:
        self.send_error(408, explanation='Request timed out')
def get(self):
    """Detect the language of ?q=...

    Uses cld2 when present ({alpha3: confidence} per reliable guess, or a
    placeholder response when unreliable); otherwise falls back to
    analyser-coverage estimation, which may time out with 408.
    """
    text = self.get_argument('q')
    if not text:
        return self.send_error(400, explanation='Missing q argument')
    if cld2:
        guesses = cld2.detect(text)
        if guesses[0]:  # first element: whether the guess is reliable
            # guesses[2] is the per-language detail list; skip 'un' entries.
            response = {
                to_alpha3_code(detail[1]): detail[2]
                for detail in guesses[2]
                if detail[1] != 'un'
            }
            self.send_response(response)
        else:
            self.send_response({'nob': 100})  # TODO: Some more reasonable response
    else:
        # cld2 unavailable: fall back to coverage across all analysers.
        try:
            coverages = yield gen.with_timeout(
                timedelta(seconds=self.timeout),
                get_coverages(text, self.analyzers, penalize=True),
            )
            self.send_response(coverages)
        except gen.TimeoutError:
            self.send_error(408, explanation='Request timed out')
def get(self):
    """Run per-word operations (?modes=..., space-separated) on ?q=...

    Executes process_per_word() in a single-worker process pool so the
    pipeline can be hard-killed on timeout, then formats the result as a
    list of {'input': ..., <mode>: ...} dicts (or just the unit covering
    1-based word position ?pos=, when given).
    """
    lang = to_alpha3_code(self.get_argument('lang'))
    modes = set(self.get_argument('modes').split(' '))
    query = self.get_argument('q')
    # Reject any mode outside the supported per-word operations.
    if not modes <= {'morph', 'biltrans', 'tagger', 'disambig', 'translate'}:
        self.send_error(400, explanation='Invalid mode argument')
        return

    def handle_output(output):
        """Format the worker result and send the HTTP response.

        output is None when the worker produced nothing (400), falsy but
        not None on timeout (408), otherwise the tuple
        (outputs, tagger_lexical_units, morph_lexical_units).
        """
        if output is None:
            self.send_error(400, explanation='No output')
            return
        elif not output:
            self.send_error(408, explanation='Request timed out')
            return
        else:
            outputs, tagger_lexical_units, morph_lexical_units = output
            to_return = []
            # Prefer the tagger's segmentation; fall back to the analyser's.
            for (index, lexical_unit) in enumerate(tagger_lexical_units if tagger_lexical_units else morph_lexical_units):
                unit_to_return = {}
                # Surface form: text before the first '/' with tags stripped.
                unit_to_return['input'] = strip_tags(lexical_unit.split('/')[0])
                for mode in modes:
                    unit_to_return[mode] = outputs[mode][index]
                to_return.append(unit_to_return)
            if self.get_argument('pos', default=None):
                # ?pos= is a 1-based word position; find the unit covering it.
                requested_pos = int(self.get_argument('pos')) - 1
                current_pos = 0
                for unit in to_return:
                    input = unit['input']  # NOTE: shadows the builtin `input`
                    current_pos += len(input.split(' '))
                    if requested_pos < current_pos:
                        self.send_response(unit)
                        return
            else:
                self.send_response(to_return)

    # One-process pool so a hung pipeline can be terminated on timeout.
    pool = Pool(processes=1)
    result = pool.apply_async(process_per_word, (self.analyzers, self.taggers, lang, modes, query))
    pool.close()

    @run_async_thread
    def worker(callback):
        # Blocks in a thread waiting for the pool result; on timeout, kills
        # the worker process and reports None so handle_output sends 408.
        try:
            callback(result.get(timeout=self.timeout))
        except TimeoutError:
            pool.terminate()
            callback(None)

    output = yield gen.Task(worker)
    handle_output(output)
def get(self):
    """Per-word analysis endpoint: apply ?modes=... to ?q=... for ?lang=...

    Runs process_per_word() inside a one-process pool (so it can be
    terminated on timeout) and responds with one dict per lexical unit,
    {'input': ..., <mode>: ...}; with ?pos= (1-based word position) only
    the covering unit is returned.
    """
    lang = to_alpha3_code(self.get_argument('lang'))
    modes = set(self.get_argument('modes').split(' '))
    query = self.get_argument('q')
    # Only these per-word operations are supported.
    if not modes <= {
            'morph', 'biltrans', 'tagger', 'disambig', 'translate'
    }:
        self.send_error(400, explanation='Invalid mode argument')
        return

    def handle_output(output):
        """Turn the raw worker output into the HTTP response.

        output: None -> 400 'No output'; other falsy -> 408 timeout;
        otherwise (outputs, tagger_lexical_units, morph_lexical_units).
        """
        if output is None:
            self.send_error(400, explanation='No output')
            return
        elif not output:
            self.send_error(408, explanation='Request timed out')
            return
        else:
            outputs, tagger_lexical_units, morph_lexical_units = output
            to_return = []
            # Tagger segmentation wins; otherwise use the morph analyser's.
            for (index, lexical_unit
                 ) in enumerate(tagger_lexical_units if tagger_lexical_units else morph_lexical_units):
                unit_to_return = {}
                # Surface form: text before the first '/', tags stripped.
                unit_to_return['input'] = strip_tags(
                    lexical_unit.split('/')[0])
                for mode in modes:
                    unit_to_return[mode] = outputs[mode][index]
                to_return.append(unit_to_return)
            if self.get_argument('pos', default=None):
                # Locate the unit covering the requested 1-based position.
                requested_pos = int(self.get_argument('pos')) - 1
                current_pos = 0
                for unit in to_return:
                    input = unit['input']  # NOTE: shadows the builtin `input`
                    current_pos += len(input.split(' '))
                    if requested_pos < current_pos:
                        self.send_response(unit)
                        return
            else:
                self.send_response(to_return)

    # Single-worker pool: lets us hard-kill a hung pipeline on timeout.
    pool = Pool(processes=1)
    result = pool.apply_async(
        process_per_word,
        (self.analyzers, self.taggers, lang, modes, query))
    pool.close()

    @run_async_thread
    def worker(callback):
        # Waits (in a thread) for the pool result; on TimeoutError the
        # worker process is terminated and None signals the 408 path.
        try:
            callback(result.get(timeout=self.timeout))
        except TimeoutError:
            pool.terminate()
            callback(None)

    output = yield gen.Task(worker)
    handle_output(output)