import gzip
import json
import sys

import apertium


def main():
    if len(sys.argv) < 5:
        print("Usage: <nlpdoc-path> <output_path> <pair_path> <lang>")
        sys.exit(1)
    nlpdocpath = sys.argv[1]
    output_path = sys.argv[2]
    pair_path = sys.argv[3]
    language = sys.argv[4]
    counter = 0
    apertium.append_pair_path(pair_path)
    apert = apertium.Analyzer(language)
    with gzip.open(nlpdocpath, 'rt') as r, gzip.open(output_path, 'at') as wr, \
            open("exceptions.{}.txt".format(language), 'w', encoding='utf-8') as ex:
        for line in r:
            counter += 1
            nlpdoc_dict = json.loads(line)
            text = nlpdoc_dict['text']
            # ApertiumAnalyzer is assumed to be a project-local wrapper around the analyzer.
            apertium_analyzer = ApertiumAnalyzer(apert, text)
            lemmas = []
            for lemma in apertium_analyzer.compute_lemmas():
                # print(lemma)
                lemmas += lemma
            if len(lemmas) == 0:
                # Record the line number of any document that yields no lemmas.
                ex.write(str(counter) + "\n")
            else:
                tokens = [{"start": lemma["start"], "end": lemma["end"]} for lemma in lemmas]
                nlpdoc_dict["annotations"]['token'] = tokens
                nlpdoc_dict["annotations"]['lemma'] = lemmas
                wr.write(json.dumps(nlpdoc_dict) + "\n")


if __name__ == '__main__':
    main()
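# Example invocation (a sketch; the script name and file paths below are illustrative,
# not taken from the source): the input and output are gzipped JSON-lines documents and
# the third argument points at an installed Apertium language-pair directory.
#
#   python annotate_lemmas.py docs.jsonl.gz docs.annotated.jsonl.gz /path/to/apertium-pair eng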
# SReading and known are provided by the streamparser dependency of apertium-python.
def test_analyzer_en(self):
    analyzer = apertium.Analyzer('en')
    lexical_units = analyzer.analyze('cats')
    lexical_unit = lexical_units[0]
    self.assertListEqual(lexical_unit.readings, [[SReading(baseform='cat', tags=['n', 'pl'])]])
    self.assertEqual(lexical_unit.wordform, 'cats')
    self.assertEqual(lexical_unit.knownness, known)
def test_uninstalled_mode(self):
    with self.assertRaises(apertium.ModeNotInstalled):
        apertium.Analyzer('spa')
def translate(data):
    # Despite its name, this currently returns morphological analyses;
    # the actual translation step is commented out below.
    analyzer = apertium.Analyzer('en')
    analyzed = analyzer.analyze(data)
    # translator = apertium.Translator('en', 'spa')
    # translated = translator.translate('cat')
    return analyzed
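# Usage sketch (assumes the English analyzer mode is installed, as in test_analyzer_en above):
#   units = translate('cats')
#   units[0].readings  # -> [[SReading(baseform='cat', tags=['n', 'pl'])]]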