def test_empty_readings(self):
    with warnings.catch_warnings(record=True) as caught_warnings:
        warnings.simplefilter('always')
        next(parse('^foo/$'))
        self.assertEqual(len(caught_warnings), 1)
        self.assertTrue(issubclass(caught_warnings[0].category, RuntimeWarning))
        self.assertIn('Empty readings', str(caught_warnings[0].message))
def get(self):
    in_text = self.get_argument('q') + '*'
    in_mode = to_alpha3_code(self.get_argument('lang'))
    if '-' in in_mode:
        l1, l2 = map(to_alpha3_code, in_mode.split('-', 1))
        in_mode = '%s-%s' % (l1, l2)
    in_mode = self.find_fallback_mode(in_mode, self.spellers)
    logging.info(in_text)
    logging.info(self.get_argument('lang'))
    logging.info(in_mode)
    logging.info(self.spellers)
    if in_mode in self.spellers:
        logging.info(self.spellers[in_mode])
        path, mode = self.spellers[in_mode]
        logging.info(path)
        logging.info(mode)
        formatting = 'none'
        commands = [['apertium', '-d', path, '-f', formatting,
                     self.get_argument('lang') + '-tokenise']]
        result = yield translate_simple(in_text, commands)
        tokens = streamparser.parse(result)
        units = []
        for token in tokens:
            if token.knownness == streamparser.known:
                units.append({'token': token.wordform, 'known': True, 'sugg': []})
            else:
                suggestions = []
                commands = [['apertium', '-d', path, '-f', formatting, mode]]
                result = yield translate_simple(token.wordform, commands)
                found_sugg = False
                for line in result.splitlines():
                    if 'Corrections for' in line:
                        found_sugg = True
                        continue
                    if found_sugg and '\t' in line:
                        s, w = line.split('\t')
                        suggestions.append((s, w))
                units.append({'token': token.wordform, 'known': False, 'sugg': suggestions})
        self.send_response(units)
    else:
        error_explanation = '{} on spellchecker mode: {}'.format(
            'Error 404', 'Spelling mode for ' + in_mode + ' is not installed')
        self.send_error(404, explanation=error_explanation)
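# Shape of the response the handler above sends back (keys taken from the code;
# the concrete tokens, suggestions, and weights are invented for illustration --
# each 'sugg' entry is a pair read from the speller's tab-separated output):
#   [{'token': 'hello', 'known': True, 'sugg': []},
#    {'token': 'wrold', 'known': False, 'sugg': [('world', '5.0')]}]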
def collect(sentence, sentence_tagged):
    """
    Build a dictionary mapping each word's position in the sentence
    to its token and tag set.
    """
    sentence_words = {}
    tokens = tokenizer(sentence.lower())
    sentence_tagged = sentence_tagged.split(' ')
    for i in range(len(sentence_tagged)):
        if len(re.findall(r'\/\*', sentence_tagged[i])) > 1:
            word = re.search(r'\/\*(.*?)\/\*', sentence_tagged[i]).group(1)
            word = re.sub(r'\$', '', word)
            word = re.sub(r'\^', '', word)
            word = '^' + word + '/*' + word + '$'
            sentence_tagged[i] = word
    sentence_tagged = ' '.join(sentence_tagged)
    units = parse(sentence_tagged)
    counter = 0
    for unit in units:
        sentence_words[counter] = [tokens[counter], set(unit.readings[0][0][1])]
        counter += 1
    return sentence_words
def collect(sentence, sentence_tagged):
    sentence_words = {}
    tokens = tokenizer(sentence.lower())
    sentence_tagged = sentence_tagged.split(' ')
    for i in range(len(sentence_tagged)):
        if len(re.findall(r'\/\*', sentence_tagged[i])) > 1:
            word = re.search(r'\/\*(.*?)\/\*', sentence_tagged[i]).group(1)
            word = re.sub(r'\$', '', word)
            word = re.sub(r'\^', '', word)
            word = '^' + word + '/*' + word + '$'
            sentence_tagged[i] = word
    sentence_tagged = ' '.join(sentence_tagged)
    units = parse(sentence_tagged)
    counter = 0
    # tokens and parsed units can get out of sync; stop at the first mismatch
    try:
        for unit in units:
            sentence_words[counter] = [tokens[counter], set(unit.readings[0][0][1])]
            counter += 1
    except IndexError:
        pass
    return sentence_words
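# Rough sketch of collect()'s output (the tagged stream is invented, and
# tokenizer() is assumed to return one token per lexical unit):
#   collect('the cat', '^the/the<det><def><sp>$ ^cat/cat<n><sg>$')
#   => {0: ['the', {'det', 'def', 'sp'}], 1: ['cat', {'n', 'sg'}]}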
def parse_sf(apertium_string):
    surface_forms = []
    parsed_string = parse(apertium_string)
    for word in parsed_string:
        surface_forms.append(word.wordform)
    return surface_forms
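# For example (stream invented for illustration):
#   parse_sf('^cats/cat<n><pl>$ ^sleep/sleep<vblex><pres>$')  # => ['cats', 'sleep']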
def test_parse(self):
    lexical_units = list(parse(self.s1))
    self.assertEqual(len(lexical_units), 1)
    lexical_unit = lexical_units[0]
    self.assertEqual(str(lexical_unit), r'x\/y\^\$\<z\>å/A\$\^B<tag><tag2>/A\/S<tag><#1-\>2>')
    readings = lexical_unit.readings
    self.assertListEqual(readings, [
        [SReading(baseform='A\\$\\^B', tags=['tag', 'tag2'])],
        [SReading(baseform='A\\/S', tags=['tag', '#1-\\>2'])],
    ])
    self.assertEqual(lexical_unit.wordform, r'x\/y\^\$\<z\>å')
    self.assertEqual(lexical_unit.knownness, known)
def test_blanks_with_wordbound_blanks(self):
    lexical_units_with_blanks = list(parse(self.s5, with_text=True))
    self.assertEqual(len(lexical_units_with_blanks), 3)
    blank, _lexical_unit = lexical_units_with_blanks[0]
    self.assertEqual(blank, r'[] ')
    blank, _lexical_unit = lexical_units_with_blanks[1]
    self.assertEqual(blank, r' [bl] ')
    blank, _lexical_unit = lexical_units_with_blanks[2]
    self.assertEqual(blank, r' [\[] [\]blank] ')
def stream_parser_split_X_y(file_lines):
    lexical_units = parse('\n'.join(file_lines))
    X = []
    y = []
    for lexical_unit in lexical_units:
        y.append(reading_to_string(lexical_unit.readings[0]))
        X.append(lexical_unit.wordform)
    assert len(y) == len(X), 'Token and Target vectors size mismatch ({}!={})'.format(len(y), len(X))
    return X, y
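# Sketch of the resulting split (streams invented for illustration):
#   X, y = stream_parser_split_X_y(['^cats/cat<n><pl>$', '^sat/sit<vblex><past>$'])
#   X  => ['cats', 'sat']
#   y  => ['cat<n><pl>', 'sit<vblex><past>']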
def stream_parser_extract_analyses(line):
    units = [unit for unit in parse(line)]
    # TODO: Handle cases such as "Empty readings for ///<sent>" in a better way
    if not units:
        return ['']
    unit = units[0]  # Is the "///<sent>" really handled?
    analyses = [reading_to_string(reading) for reading in unit.readings]
    return analyses if len(analyses) else ['']
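# For an ambiguous unit (invented for illustration):
#   stream_parser_extract_analyses('^run/run<n><sg>/run<vblex><inf>$')
#   => ['run<n><sg>', 'run<vblex><inf>']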
def _postproc_text(result: str) -> List[LexicalUnit]:
    """
    Postprocesses the input.

    Args:
        result (str)

    Returns:
        List[LexicalUnit]
    """
    lexical_units = list(parse(result))  # type: List[LexicalUnit]
    return lexical_units
def _postproc_text(self, result):
    # type: (Analyzer, str) -> List[LexicalUnit]
    """
    Postprocesses the input.

    Args:
        result (str)

    Returns:
        List[LexicalUnit]
    """
    lexical_units = list(parse(result))
    return lexical_units
def test_wordbound_blanks(self):
    lexical_units = list(parse(self.s5))
    self.assertEqual(len(lexical_units), 3)
    self.assertListEqual(
        lexical_units[2].readings,
        [
            [SReading(baseform='name', tags=['n', 'sg'])],
            [SReading(baseform='name', tags=['vblex', 'inf'])],
            [SReading(baseform='name', tags=['vblex', 'pres'])],
        ],
    )
    self.assertEqual(lexical_units[0].wordform, 'My')
    self.assertEqual(lexical_units[0].wordbound_blank, '[[t:b:123456]]')
    self.assertEqual(lexical_units[1].wordform, 'test')
    self.assertEqual(lexical_units[1].wordbound_blank, '')
    self.assertEqual(lexical_units[2].wordform, 'name')
    self.assertEqual(lexical_units[2].wordbound_blank, '[[t:i:12asda; t:p:1abc76]]')
def test_parse_subreadings(self):
    lexical_units = list(parse(self.s4))
    self.assertEqual(len(lexical_units), 1)
    self.assertListEqual(
        lexical_units[0].readings,
        [
            [
                SReading(baseform='decir', tags=['vblex', 'imp', 'p2', 'sg']),
                SReading(baseform='me', tags=['prn', 'enc', 'p1', 'mf', 'sg']),
                SReading(baseform='lo', tags=['prn', 'enc', 'p3', 'nt']),
            ],
            [
                SReading(baseform='decir', tags=['vblex', 'imp', 'p2', 'sg']),
                SReading(baseform='me', tags=['prn', 'enc', 'p1', 'mf', 'sg']),
                SReading(baseform='lo', tags=['prn', 'enc', 'p3', 'm', 'sg']),
            ],
        ],
    )
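# For reference, an s4 matching these assertions would look like the stream
# below (the surface form 'dímelo' is an assumption; the two analyses follow
# directly from the expected readings above):
#   s4 = ('^dímelo/decir<vblex><imp><p2><sg>+me<prn><enc><p1><mf><sg>+lo<prn><enc><p3><nt>'
#         '/decir<vblex><imp><p2><sg>+me<prn><enc><p1><mf><sg>+lo<prn><enc><p3><m><sg>$')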
def test_subreading_to_string(self):
    lexical_units = list(parse(self.s4))
    self.assertEqual(len(lexical_units), 1)
    self.assertEqual(subreading_to_string(lexical_units[0].readings[0][0]),
                     'decir<vblex><imp><p2><sg>')
def test_reading_to_string(self):
    lexical_units = list(parse(self.s4))
    self.assertEqual(len(lexical_units), 1)
    self.assertEqual(reading_to_string(lexical_units[0].readings[0]),
                     'decir<vblex><imp><p2><sg>+me<prn><enc><p1><mf><sg>+lo<prn><enc><p3><nt>')
def test_mainpos_ltr(self):
    lexical_units = list(parse(self.s4))
    self.assertEqual(len(lexical_units), 1)
    pos = mainpos(lexical_units[0].readings[0], ltr=True)
    self.assertEqual(pos, 'vblex')
def test_mainpos(self):
    lexical_units = list(parse(self.s4))
    self.assertEqual(len(lexical_units), 1)
    pos = mainpos(lexical_units[0].readings[0])
    self.assertEqual(pos, 'prn')
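# Taken together, the two tests above show mainpos()'s behaviour on subreadings:
# by default it reads the POS off the last subreading ('prn' from 'lo'), while
# ltr=True reads it off the first one ('vblex' from 'decir').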
def test_parse_with_text(self):
    lexical_units_with_blanks = list(parse(self.s1, with_text=True))
    self.assertEqual(len(lexical_units_with_blanks), 1)
    blank, _lexical_unit = lexical_units_with_blanks[0]
    self.assertEqual(blank, r'[\^keep<escapes>\$] \^ \$ \/ \[ \] ')
def test_parse_unknown(self):
    lexical_units = list(parse(self.s2))
    self.assertEqual(len(lexical_units), 1)
    self.assertEqual(lexical_units[0].knownness, unknown)
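# s2 is presumably an unanalysed unit; in the Apertium stream format these are
# marked with '*' before the fallback form, e.g. '^foo/*foo$'.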
def clean_trash(orig_source, orig_mt, orig_target, freqs, s_lang, t_lang, input_file):
    with open('tagged_source_entries.txt', 'r', encoding='utf-8') as file:
        source = file.read().strip('\n').split('\n')
    with open('tagged_mt_entries.txt', 'r', encoding='utf-8') as file:
        mt = file.read().strip('\n').split('\n')
    with open('tagged_target_entries.txt', 'r', encoding='utf-8') as file:
        target = file.read().strip('\n').split('\n')
    for i in range(len(source)):
        source[i] = bla(source[i])
    for i in range(len(mt)):
        mt[i] = bla(mt[i])
    for i in range(len(target)):
        target[i] = bla(target[i])
    counter = 0
    punctuation_tags = {'lquot', 'rquot', 'sent', 'cm', 'guio'}
    with open(s_lang + '-' + t_lang + '-cleaned_' + input_file, 'w', encoding='utf-8') as file:
        for s, m, t in zip(source, mt, target):
            s_units = parse(s)
            m_units = parse(m)
            t_units = parse(t)
            for su, mu, tu in zip(s_units, m_units, t_units):
                for sr, mr, tr in zip(su.readings, mu.readings, tu.readings):
                    s_lemma, s_tags = sr[0][0], sr[0][1]
                    m_lemma, m_tags = mr[0][0], mr[0][1]
                    t_lemma, t_tags = tr[0][0], tr[0][1]
                    # Skip units whose source or target reading is punctuation.
                    if punctuation_tags & set(t_tags) or punctuation_tags & set(s_tags):
                        continue
                    # Percentage similarity between lemmas, from Levenshtein distance.
                    st_dis = distance(s_lemma, t_lemma)
                    st_percent = (len(t_lemma) - st_dis) / len(t_lemma) * 100
                    sm_dis = distance(s_lemma, m_lemma)
                    sm_percent = (len(m_lemma) - sm_dis) / len(m_lemma) * 100
                    mt_dis = distance(m_lemma, t_lemma)
                    mt_percent = (len(t_lemma) - mt_dis) / len(t_lemma) * 100
                    # Keep the entry if all three pairs are similar enough, or
                    # unconditionally for 'other' input files.
                    if 'other' in input_file or (st_percent >= 30 and sm_percent >= 30 and mt_percent >= 30):
                        file.write('%s\t%s\t%s\t%s\n' % (orig_source[counter], orig_mt[counter],
                                                         orig_target[counter], freqs[counter]))
            counter += 1
else:
    input_idiom = sys.stdin.read()

#n = 1  # Number of words it will replace

from streamparser import parse, mainpos, reading_to_string

replacement_candidates = []
candidate_tags = ['n', 'adj', 'adv', 'vblex', 'v']

# Parse input idiom to get a candidate for replacement
lu_count = 0
input_idiom_surface = []
for lu in parse(input_idiom):
    analyses = lu.readings
    firstreading = analyses[0]
    surfaceform = lu.wordform
    input_idiom_surface.append(surfaceform)
    #print(firstreading[0].tags)
    #print("^{}/{}$".format(surfaceform,
    #                       reading_to_string(firstreading)))
    for tag in candidate_tags:
        if tag in firstreading[0].tags:
            replacement_candidates.append([firstreading[0], surfaceform, lu_count])
    lu_count += 1  # track the unit's position, as in malaphor() below
def malaphor(input_idiom, lang):
    idioms_list = open(f'{lang}-idioms-analysed.txt', 'r').read().split('\n')
    replacement_candidates = []
    candidate_tags = ['n', 'adj', 'adv', 'vblex', 'v']

    # Parse input idiom to get a candidate for replacement
    lu_count = 0
    input_idiom_surface = []
    for lu in parse(input_idiom):
        if debug:
            print(lu)
        analyses = lu.readings
        if debug:
            print(analyses)
        firstreading = analyses[0]
        surfaceform = lu.wordform
        input_idiom_surface.append(surfaceform)
        if debug:
            print(firstreading[0].tags)
            print("^{}/{}$".format(surfaceform, reading_to_string(firstreading)))
        for tag in candidate_tags:
            if tag in firstreading[0].tags:
                replacement_candidates.append([firstreading[0], surfaceform, lu_count])
        lu_count += 1
    if debug:
        print(replacement_candidates)
    if len(replacement_candidates) < 1:
        raise NoCandidatesForReplacement()
    elif len(replacement_candidates) == 1:
        final_replacement_candidate = replacement_candidates[0]
    else:
        final_replacement_candidate = replacement_candidates[random.randint(
            0, len(replacement_candidates) - 1)]
    if debug:
        print("Final Replacement Candidate:")
        print(final_replacement_candidate)

    # Parse the idioms list (analysed through the morph) to find a suitable replacement
    possible_replacements = []
    for idiom in idioms_list:
        for lu in parse(idiom):
            if lu.readings[0][0].tags == final_replacement_candidate[0].tags:
                possible_replacements.append([lu.readings[0][0], lu.wordform])
    replacement_flag = 1
    if len(possible_replacements) < 1:
        raise NoReplacement()
    elif len(possible_replacements) == 1:
        final_replacement_word = possible_replacements[0]
    else:
        final_replacement_word = possible_replacements[random.randint(
            0, len(possible_replacements) - 1)]
    if debug:
        print("Replacement Word:")
        print(final_replacement_word)

    # Make the replacement in the original idiom
    pos = final_replacement_candidate[2]  # original position of the word in the input idiom
    if debug:
        print(input_idiom_surface)
    input_idiom_surface[pos] = final_replacement_word[1]
    if debug:
        print(input_idiom_surface)
    out = ''
    for word in input_idiom_surface:
        if out != '' and word[0] in string.ascii_letters + string.digits:
            out += ' ' + word
        else:
            out += word
    return out.capitalize()
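# A rough usage sketch (the input stream and idiom file contents are invented,
# and debug is assumed to be a module-level flag):
#   malaphor('^kick/kick<vblex><inf>$ ^the/the<det><def><sp>$ ^bucket/bucket<n><sg>$', 'eng')
# might swap 'bucket' for another <n><sg> lexical unit drawn at random from
# eng-idioms-analysed.txt and return, say, 'Kick the habit'.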
######################################################################################################################
print("\nProcessing main, words_case, words, lemmas, tags, tags_uniq section:")
######################################################################################################################
# 1st cycle
######################################################################################################################
#
# Collecting words, lemmas, tags...
#
#for blank, lexicalUnit in parse_file(fr, with_text=True):
for line in fr:
    line = line.strip(" \r\n")
    #print("LINE:" + str(line))
    for blank, lexicalUnit in parse(line, with_text=True):
        # MySQL (MariaDB) does not distinguish the Russian "ё" and "е" letters
        #line = re.sub(r'ё', r"е", line)  # ё => е
        #line = re.sub(r'Ё', r"Е", line)  # Ё => Е
        #line = re.sub(r'\\', r"", line)  # remove backslashes, otherwise mysql will not import those lines
        #print("BLANK:" + str(blank) + " lexicalUnit:" + str(lexicalUnit))
        #sys.stdout.flush()
        blank = blank.strip(' \t')
        if blank != "":
            status = collect_tokens(blank, blank, blank, blanktag, blanktag)
        # NOTE: word_case, word, lemma, pos, and tags are presumably initialised
        # earlier in the full script; this excerpt starts mid-way through it.
        status, word_case, word, lemma, pos, tags = parse_apertium(word_case, word, lemma, pos, tags)
        status = collect_tokens(word_case, word, lemma, pos, tags)