def test_less_than_3(self):
    entry1 = Entry(entry='1', part_of_speech='2', language='kl', definitions=['3'])
    entry2 = Entry(entry='1', part_of_speech='3', language='kl', definitions=['4'])
    self.assertLess(entry1, entry2)
def test_less_than_2(self):
    entry1 = Entry(entry='a', part_of_speech='i', language='kk', definitions=['3'])
    entry2 = Entry(entry='b', part_of_speech='i', language='kk', definitions=['3'])
    self.assertLess(entry1, entry2)
def setUp(self) -> None:
    self.entry1 = Entry(entry='test1', language='l1', part_of_speech='ana',
                        definitions=['def1-1', 'def2-1'])
    self.entry2 = Entry(entry='test2', language='l2', part_of_speech='ana',
                        definitions=['def2-2', 'def2-2'])
    self.entry3 = Entry(entry='test3', language='l3', part_of_speech='ana',
                        definitions=['def3-3', 'def2-3'])
def translate(self, entry: Entry, source_language: str,
              target_language: str = 'mg') -> Entry:
    out_definitions = []
    # serialise() does not depend on the definition being processed,
    # so it only needs to be called once, outside the loop.
    out_entry_dict = entry.serialise()
    for definition in entry.definitions:
        for method in self.methods:
            extracted_definition = method(entry.part_of_speech, definition,
                                          source_language, target_language)
            # Only keep results the method managed to translate.
            if isinstance(extracted_definition, TranslatedDefinition):
                out_definitions.append(extracted_definition)
    out_entry_dict['definitions'] = out_definitions
    return Entry(**out_entry_dict)
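# A minimal sketch of a translation method compatible with translate() above.
# The positional signature is inferred from the call site; whether
# TranslatedDefinition can be built from a plain string is an assumption.
def passthrough_method(part_of_speech, definition, source_language, target_language):
    # Return a TranslatedDefinition so translate() keeps the result;
    # any other return value is silently ignored.
    return TranslatedDefinition(definition)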
def process_entry_in_foreign_language(self, entry: Entry, title: str,
                                      language: str, unknowns: list):
    if entry.language in self.language_blacklist:
        log.debug("language '%s' is blacklisted, so not translating or processing."
                  % language)
        return

    try:
        log.debug("Translating word in foreign language (%s in '%s')"
                  % (entry.definitions[0], language))
        target_language_translations = []
        for translation in self.translate_word(entry.definitions[0], language):
            if translation['part_of_speech'] == entry.part_of_speech:
                target_language_translations.append(translation['definition'])
        if len(target_language_translations) == 0:
            log.debug("No matching translations found")
            return
    except NoWordException:
        log.debug("No translation found")
        # Track the untranslated (word, language) pair. The original checked
        # `title` against a list of tuples, which could never match.
        if (entry.definitions[0], language) not in unknowns:
            unknowns.append((entry.definitions[0], language))
        return

    infos = Entry(entry=title,
                  part_of_speech=str(entry.part_of_speech),
                  definitions=target_language_translations,
                  language=entry.language,
                  origin_wiktionary_edition=language,
                  origin_wiktionary_page_name=entry.definitions[0])
    return infos
def run(self, word_additional_data_info, counter=0):
    translation = Translation()
    translation.output.wikipage_renderer.pages_to_link = self.malagasy_words_to_link
    word = word_additional_data_info['word']
    word_id = word_additional_data_info['word_id']
    part_of_speech = word_additional_data_info['part_of_speech']
    definitions = self.get_definitions(word_id)
    translated_definitions = self.get_translated_definitions(word_id)
    print(f'{word} ({word_id}) ->', definitions, translated_definitions)

    entry = Entry(entry=word,
                  part_of_speech=part_of_speech,
                  language=self.language,
                  definitions=translated_definitions)

    # Skip entries with no usable translated definition.
    if not translated_definitions:
        return
    if translated_definitions[0] == '.':
        return

    response = pywikibot.input(f'Entry # {counter + 1}: Accept and upload? (y/n)')
    if response.strip() == 'y':
        translation.publish_to_wiktionary(entry.entry, [entry])
        translation._save_translation_from_page([entry])
        self.mark_definition(word_id, 'done')
    elif response.strip() == 'n':
        self.mark_definition(word_id, 'rejected')
def run_from_csv(self, csv_path, language='la'):
    with open(csv_path, 'r') as csv_file:
        reader = csv.reader(csv_file, delimiter=';')
        for row in reader:
            title, pos, en_defn, mg_defn = row[:4]
            if not mg_defn.strip():
                continue
            pos = pos.strip()
            mg_defn = mg_defn[0].upper() + mg_defn[1:].lower()
            print('>>>>> ' + title + ' <<<<<')
            try:
                entry = Entry(
                    entry=title,
                    language=language,
                    part_of_speech=pos,
                    definitions=[mg_defn],
                )
                wiki_string = self.renderer.render(entry)
                summary_if_new = wiki_string.replace('\n', ' ')
                summary_if_already_exists = '/* {{=' + language + '=}} */'
                # Malagasy edit summary: "Page created with « ... »"
                summary_if_new = "Pejy voaforona amin'ny « " + summary_if_new + ' »'
                print(entry)
                self.publisher.publish(entry, title, wiki_string,
                                       summary_if_already_exists, summary_if_new)
            except SkippedWord:
                continue
            else:
                self.output.db(entry)
def test_to_tuple(self):
    entry = Entry(entry='1', part_of_speech='2', definitions=['3'], language='fr')
    self.assertEqual(entry.entry, '1')
    self.assertEqual(entry.part_of_speech, '2')
    self.assertEqual(entry.definitions, ['3'])
def test_deep_copy(self):
    old = Entry(entry='tasumaki', part_of_speech='2', language='mg', definitions=['3'])
    new = copy.deepcopy(old)
    new.entry = 'wrong'
    new.definitions = ['potomaki']
    self.assertNotEqual(new.entry, old.entry)
    self.assertNotEqual(new.definitions, old.definitions)
def get_all_entries(self, keepNativeEntries=False, **kw):
    """Language sections in a given page. Returns a list of Entry objects,
    one per (POS, lang, definition) triple found."""
    items = []
    if self.content is None:
        # Original message in Malagasy: "self.page is not defined;
        # self.process() has not been called yet."
        raise Exception(
            "self.page is not defined; self.process() has not been called yet")
    ct_content = self.content
    for lang in re.findall('{{S\\|([a-z]+)\\|([a-z]{2,3})', self.content):
        # print(ct_content)
        # Word definition retrieval: slice out the section between this
        # header and the next language header.
        d1 = ct_content.find("{{S|%s|%s" % lang)
        d2 = ct_content.find("=={{langue|", d1) + 1
        if not d2:
            d2 = ct_content.find("== {{langue|", d1 + 50) + 1
        d_ptr = ct_content.find("=={{langue|%s" % lang[1], d1) + 1
        if not d_ptr:
            d_ptr = ct_content.find("== {{langue|%s" % lang[1], d1) + 1
        if d2 > d1:
            definition = ct_content[d1:d2]
        else:
            definition = ct_content[d1:]
        try:
            definition = definition.split('\n# ')[1]
            definition = re.sub("\\[\\[(.*)#(.*)\\|?[.*]?\\]?\\]?",
                                "\\1", definition)
        except IndexError:
            ct_content = ct_content[d_ptr:]
            continue
        ct_content = ct_content[d_ptr:]
        if definition.find('\n') + 1:
            definition = definition[:definition.find('\n')]
        definition = stripwikitext(definition)
        if not definition:
            ct_content = ct_content[d_ptr:]
            continue

        pos = frpos = lang[0].strip()  # part of speech
        if frpos in self.postran:
            pos = self.postran[frpos]
        i = Entry(entry=self.title,
                  part_of_speech=pos,
                  language=lang[1].strip(),
                  definitions=[definition.strip()])
        items.append(i)
    # print("Translations found: ", len(items))
    return items
def process_entry_in_native_language(self, content: str, title: str,
                                     language: str, unknowns: list):
    """
    Yields an Entry for each translation found.
    :param content: wiki page content
    :param title: page title
    :param language: language edition the page comes from
    :param unknowns: list collecting (word, language) pairs with no translation
    :return: generator of Entry objects
    """
    wiktionary_processor_class = entryprocessor.WiktionaryProcessorFactory.create(
        language)
    wiktionary_processor = wiktionary_processor_class()
    try:
        wiktionary_processor.set_text(content)
        wiktionary_processor.set_title(title)
        translations = wiktionary_processor.retrieve_translations()
    except Exception as exc:
        log.exception(exc)
        return

    for translation in translations:
        entry = translation.entry
        pos = translation.part_of_speech
        entry_language = translation.language
        if entry_language in self.language_blacklist:  # check language blacklist
            continue
        try:
            target_language_translations = [
                t['definition'] for t in self.translate_word(title, language)
                if t['part_of_speech'] == str(pos)
            ]
        except NoWordException:
            log.debug('No translation found for %s in %s' % (title, language))
            # Check the (title, language) pair being appended; the original
            # checked the bare title against a list of tuples.
            if (title, language) not in unknowns:
                unknowns.append((title, language))
            break

        infos = Entry(entry=entry,
                      part_of_speech=str(pos),
                      definitions=target_language_translations,
                      language=entry_language,
                      origin_wiktionary_edition=language,
                      origin_wiktionary_page_name=title)
        yield infos
def render(self, info: Entry, link=True) -> str:
    data = info.serialise()
    s = """
{{-%(language)s-}}
'''{{subst:BASEPAGENAME}}'''""" % data
    if link:
        s += "\n# %s" % ', '.join('[[%s]]' % d for d in info.definitions)
    else:
        s += "\n# %s" % ', '.join('%s' % d for d in info.definitions)
    s += '\n{{bot-made translation|%s}}' % info.origin_wiktionary_page_name
    # In Python 3 a str can never raise UnicodeDecodeError on return, so the
    # old try/except with s.decode('utf8') (a Python 2 leftover) is dropped.
    return s
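# Illustrative output of render() for a hypothetical French entry with
# definitions=['eau'] and origin_wiktionary_page_name='eau', link=True:
#
#   {{-fr-}}
#   '''{{subst:BASEPAGENAME}}'''
#   # [[eau]]
#   {{bot-made translation|eau}}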
def get_all_entries(self, keep_native_entries=False, **kw):
    items = []
    if self.content is None:
        return []
    for regex in [self.form_of_regex, self.lemma_regex]:
        for pos, lang in re.findall(regex, self.content):
            pos = pos.strip()
            # `in ('etim')` tested substring membership in a string;
            # an equality test is what was intended.
            if pos == 'etim':
                continue

            # Word definition retrieval
            d1 = self.content.find("{{-%s-|%s}}" % (pos, lang)) \
                + len("{{-%s-|%s}}" % (pos, lang))
            d2 = self.content.find("=={{=", d1) + 1 \
                or self.content.find("== {{=", d1) + 1
            if d2:
                definition = self.content[d1:d2]
            else:
                definition = self.content[d1:]
            try:
                definitions = definition.split('\n# ')[1:]
            except IndexError:
                # print("Error: no definition found")
                continue

            entry_definition = []
            for definition in definitions:
                if definition.find('\n') + 1:
                    definition = definition[:definition.find('\n')]
                definition = re.sub("\\[\\[(.*)#(.*)\\|?\\]?\\]?",
                                    "\\1", definition)
                definition = stripwikitext(definition)
                if definition:
                    entry_definition.append(definition)

            entry_definition = [d for d in entry_definition if len(d) > 1]
            if entry_definition:
                i = Entry(entry=self.title,
                          part_of_speech=pos,
                          language=lang.strip(),
                          definitions=entry_definition)
                items.append(i)
    # print("Translations found: ", len(items))
    return items
def worker(self, entry: Entry):
    """
    Updates the wiki page with the given entry.
    If the entry exists in the database, skip; else, check the language's
    existence on-wiki and if it exists, skip; else, add the entry on-wiki.
    :param entry: entry to create
    :return:
    """
    if entry.language in LANGUAGE_BLACKLIST:
        print('blacklisted: ', entry.language)
        return

    if self.lookup_cache.lookup(entry):
        return

    pprint(entry)
    output = Output()
    output.db(entry)
    if not self.update_on_wiki:
        print('not updating on wiki')
        return

    print('attempts to update on wiki...')
    wikipage = output.wikipage(entry)
    if entry.language in CYRILLIC_ALPHABET_LANGUAGES:
        entry.entry = _get_unaccented_word(entry.entry)

    page = pywikibot.Page(self.site, entry.entry)
    try:
        if page.isRedirectPage():
            return
    except Exception:
        return

    if page.exists():
        content = page.get()
        if '{{=%s=}}' % entry.language in content:
            print('exists on-wiki')
            return
        content = wikipage + '\n' + content
    else:
        content = wikipage
    page.put(content, self.summary)
def lookup(self, word) -> Entry:
    content = self.load_page(word)
    definitions = [
        self.reprocess_definition(d)
        for d in content.xpath(self.definition_xpath)
    ]
    if self.pos_xpath is not None:
        pos = content.xpath(self.pos_xpath)[0].text.strip('\n')
    else:
        pos = 'ana'
    return Entry(
        entry=word,
        part_of_speech=pos,
        language=self.language,
        definitions=definitions,
    )
def do_import(self, workers=100):
    input_database = DictionaryDatabaseManager(database_file=self.export_path)
    with input_database.engine.connect() as connection:
        query = connection.execute("""
            select
                word.id,
                word.word,
                word.language,
                word.part_of_speech,
                definitions.definition,
                definitions.definition_language
            from dictionary, word, definitions
            where dictionary.definition = definitions.id
              and word.id = dictionary.word
              and definition_language = 'mg'
        """)
        print('-- build tree --')
        for w in query.fetchall():
            word, language, part_of_speech, definition = w[1], w[2], w[3], w[4]
            key = (word, language, part_of_speech)
            if key in self.fast_tree:
                self.fast_tree[key].append(definition)
            else:
                self.fast_tree[key] = [definition]

        print('-- using tree --')
        for word, language, part_of_speech in self.fast_tree:
            entry = Entry(entry=word,
                          language=language,
                          part_of_speech=part_of_speech,
                          definitions=self.fast_tree[(word, language,
                                                      part_of_speech)])
            try:
                self.worker(entry)
            except Exception:
                continue
def dictionary_service_update_database(self, info: Entry):
    """Updates the database."""
    # Adapt to the format expected by the dictionary service
    log.info(info.serialise())
    definitions = [{
        'definition': d,
        'definition_language': self.content_language
    } for d in info.definitions]
    data = {
        'definitions': definitions,
        'word': info.entry,
        'part_of_speech': info.part_of_speech,
        'translation_method': info.translation_method
        if hasattr(info, 'translation_method') else None
    }
    response = dictionary_service.post('entry/%s/create' % info.language,
                                       json=data)
    if response.status_code == WordAlreadyExists.status_code:
        # The word already exists: fetch its ID, then edit using that ID.
        word_response = dictionary_service.get(
            'entry/%s/%s' % (info.language, info.entry)).json()
        edit_response = dictionary_service.put(
            'entry/%d/edit' % word_response[0]['id'], json=data)
        if edit_response.status_code == WordAlreadyExists.status_code:
            log.debug('%s [%s] > Attempted to create an already-existing entry.'
                      % (info.entry, info.language))
        elif edit_response.status_code != 200:
            log.error('%s [%s] > Entry update failed (%d).'
                      % (info.entry, info.language, edit_response.status_code))
def batchfile(self, info: Entry):
    """Return batch format (see doc)."""
    string = ("%(entry)s -> %(entry_definition)s -> "
              "%(part_of_speech)s -> %(language)s\n") % info.serialise()
    return string
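# Illustrative shape of a produced batch line (values hypothetical, and it
# assumes serialise() exposes the 'entry_definition' key the format string
# requires):
#   rano -> ['water'] -> ana -> mg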
def generate_wikipage_and_summaries(self, translation):
    # Fetch base information
    json_dictionary_infos_params = {'id': 'eq.' + str(translation["word_id"])}
    json_dictionary_rq = requests.get(dyn_backend.backend + '/vw_json_dictionary',
                                      params=json_dictionary_infos_params)
    if json_dictionary_rq.status_code == 200:
        json_dictionary_infos = json_dictionary_rq.json()
        additional_data = json_dictionary_infos[0]['additional_data']
    else:
        print('json_dictionary_rq.status_code', json_dictionary_rq.status_code)
        raise SkippedWord()

    definitions = []
    request_convergent_definition_rq = requests.get(
        dyn_backend.backend + '/convergent_translations',
        params={'word_id': 'eq.' + str(translation["word_id"])})
    if request_convergent_definition_rq.status_code == 200:
        definitions = [
            e['suggested_definition']
            for e in request_convergent_definition_rq.json()
        ]
    else:
        print('request_convergent_definition_rq.status_code ',
              request_convergent_definition_rq.status_code)

    # Fetch and map additional data
    additional_data_list = json_dictionary_infos[0]['additional_data']
    if additional_data_list is not None:
        # p = self.get_additional_data(
        #     additional_data_list, translation['word_id'], 'pronunciation', list)
        raw_additional_data_dict = {
            'synonyms': self.get_additional_data(
                additional_data_list, translation['word_id'], 'synonym', list),
            'antonyms': self.get_additional_data(
                additional_data_list, translation['word_id'], 'antonym', list),
            'ipa': self.get_additional_data(
                additional_data_list, translation['word_id'], 'ipa', list),
            # 'pronunciation': p[0] if p else [],
            # 'ipa': ['{{fanononana-ko}}'],
            'audio_pronunciations': self.get_additional_data(
                additional_data_list, translation['word_id'], 'audio', list),
            'related_terms': self.get_additional_data(
                additional_data_list, translation['word_id'], 'related', list),
            'derived_terms': self.get_additional_data(
                additional_data_list, translation['word_id'], 'derived', list),
            # 'references': ['{{Tsiahy:vortaro.net}}'],
            # 'references': self.get_additional_data(
            #     additional_data_list, translation['word_id'], 'reference', list),
            # 'etymology': self.get_additional_data(
            #     additional_data_list, translation['word_id'], 'etym/en', str),
        }
        additional_data_dict = {
            k: v for k, v in raw_additional_data_dict.items() if v
        }
        print(raw_additional_data_dict)
    else:
        additional_data_dict = {}

    # Compile the final object
    if definitions:
        entry_data = {
            'entry': translation["word"],
            'language': translation["language"],
            'part_of_speech': translation["part_of_speech"],
            'definitions': definitions,
        }
        for data_type in self.additional_data_types:
            if data_type in additional_data:
                entry_data[data_type] = additional_data[data_type]

        entry = Entry(**{**entry_data, **additional_data_dict})
        wiki_string = self.renderer.render(entry)
        summary_if_new = wiki_string.replace('\n', ' ')
        summary_if_already_exists = '/* {{=' + translation["language"] + '=}} */'
        if len(summary_if_new) > 147:
            summary_if_new = summary_if_new[:147] + '...'
        return entry, wiki_string, summary_if_new, summary_if_already_exists
    else:
        print('definitions', definitions)
        raise SkippedWord()
def create_missing_entries(self, xml_buffer: str):
    title_node, content_node = self.base_worker(xml_buffer)
    assert title_node is not None
    if ':' in title_node:
        return

    if self.processor_class is None:
        self.processor_class = WiktionaryProcessorFactory.create('en')
    assert self.processor_class is not None

    processor = self.processor_class()
    processor.set_title(title_node)
    processor.set_text(content_node)
    entries = processor.get_all_entries()
    for entry in entries:
        if entry.language == self.language:
            if self.translation_lookup_table.lookup(entry):
                translation = self.translation_lookup_table.translate(entry)
                new_entry = Entry(entry=entry.entry,
                                  definitions=translation,
                                  language=entry.language,
                                  part_of_speech=entry.part_of_speech)
                # print('local >', new_entry)
                self.entry_writer.add(new_entry)
                for e in processor.retrieve_translations():
                    e.definitions = translation
                    self.entry_writer.add(e)
                    # print('local translation >', e)
        else:  # RIP cyclomatic complexity.
            translations = []
            pos = entry.part_of_speech
            for definition in entry.definitions:
                try:
                    # `language` was undefined at this point in the original;
                    # self.language is the most plausible intent.
                    translation = self.translation_lookup_table.translate_word(
                        definition, self.language, entry.part_of_speech)
                except LookupError:
                    # Translation couldn't be found in the lookup table
                    if entry.part_of_speech in TEMPLATE_TO_OBJECT:
                        try:
                            # Try the inflection template parser
                            elements = templates_parser.get_elements(
                                TEMPLATE_TO_OBJECT[entry.part_of_speech],
                                definition)
                        except Exception:
                            # Add to missing translations
                            self.missing_translation_writer.add(definition)
                        else:
                            if elements:
                                # The part of speech changes to become a
                                # form-of part of speech
                                if not pos.startswith('e-'):
                                    pos = 'e-' + pos
                                translations.append(
                                    elements.to_malagasy_definition())
                else:
                    translations.append(translation[0])

            if translations:
                new_entry = Entry(entry=entry.entry,
                                  definitions=list(set(translations)),
                                  language=entry.language,
                                  part_of_speech=pos)
                # print('foreign >', new_entry)
                self.entry_writer.add(new_entry)
def create_non_lemma_entry(entry: Entry):
    word, pos, code, definition = (entry.entry, entry.part_of_speech,
                                   entry.language, entry.definitions[0])
    page_output = Output()
    mg_page = pywikibot.Page(pywikibot.Site(SITELANG, SITENAME), word)

    # Translate the template's content into Malagasy
    try:
        if pos not in TEMPLATE_TO_OBJECT:  # unsupported template
            print("Unsupported template")
            return 0
        output_object_class = TEMPLATE_TO_OBJECT[pos]
        try:
            elements = templates_parser.get_elements(output_object_class,
                                                     definition)
        except Exception:
            return 1
        if code in POST_PROCESSORS:
            elements = POST_PROCESSORS[code](elements)
        if elements is None:
            print("No elements")
            return 0
        malagasy_definition = elements.to_malagasy_definition()
        lemma = elements.lemma
        # lemma = get_lemma(output_object_class, definition)
        print(elements, malagasy_definition, lemma)
    except ParserError as exc:
        print(exc)
        return 0

    # Do not create the page if the lemma does not exist
    if lemma:
        mg_lemma_page = pywikibot.Page(pywikibot.Site(SITELANG, SITENAME), lemma)
    else:
        return 1

    try:
        if not mg_lemma_page.exists():
            print('No lemma (%s) :/' % lemma)
            return 0
        broken_redirect = False
        while mg_lemma_page.isRedirectPage():
            mg_lemma_page = mg_lemma_page.getRedirectTarget()
            if not mg_lemma_page.exists():
                broken_redirect = True
                break
        if not broken_redirect:
            content = mg_lemma_page.get()
            # `language_code` was undefined in the original; `code` is the
            # language code in scope here.
            if '{{=' + code + '=}}' not in content:
                print('No lemma (%s) :/' % lemma)
                return 0
        else:
            print('No lemma : broken redirect (%s)' % lemma)
            return 0
    except pywikibot.exceptions.InvalidTitle:
        # doing something wrong at this point
        return 0
    except Exception:
        return 1

    form_of_template = FORM_OF_TEMPLATE[pos] if pos in FORM_OF_TEMPLATE else pos
    mg_entry = Entry(
        entry=word,
        part_of_speech=form_of_template,
        definitions=[malagasy_definition],
        language=code,
    )

    # Page overwriting is enabled by the presence of a flag file.
    if os.path.isfile('/tmp/%s' % code):
        overwrite = True
        print('PAGE OVERWRITING IS ACTIVE. DELETE /tmp/%s TO DISABLE IT '
              'MID-SCRIPT.' % code)
    else:
        overwrite = False

    # Create or update the generated page
    if mg_page.exists() and not overwrite:
        new_entry = page_output.wikipage(mg_entry, link=False)
        page_content = mg_page.get()
        if page_content.find('{{=%s=}}' % code) != -1:
            if page_content.find('{{-%s-|%s}}' % (form_of_template, code)) != -1:
                print('section already exists : No need to go further')
                return 0
            # Add a part-of-speech subsection
            page_content = re.sub(r'==[ ]?{{=%s=}}[ ]?==' % code,
                                  new_entry, page_content)
        else:
            # Add a language section
            page_content = new_entry + '\n' + page_content
    else:
        # Create a new page.
        page_content = page_output.wikipage(mg_entry, link=False)

    pywikibot.output('\03{blue}%s\03{default}' % page_content)
    try:
        # Malagasy edit summary: "form of the word [[<lemma>]]"
        mg_page.put(page_content, f'endriky ny teny [[{lemma}]]')
    except Exception:
        pass
    return 1
def get_all_entries(
        self,
        keepNativeEntries=False,
        get_additional_data=False,
        cleanup_definitions=False,
        translate_definitions_to_malagasy=False,
        human_readable_form_of_definition=True,
        **kw) -> list:
    """
    Retrieves all necessary information in the form of a list of Entry objects
    :param keepNativeEntries:
    :param get_additional_data:
    :param cleanup_definitions:
    :param translate_definitions_to_malagasy:
    :param human_readable_form_of_definition:
    :param kw:
    :return:
    """
    content = self.content
    entries = []
    content = re.sub("{{l/en\\|(.*)}}", "\\1 ", content)  # remove {{l/en}}
    for l in re.findall("[\n]?==[ ]?([A-Za-z]+)[ ]?==\n", content):
        last_part_of_speech = None
        ct_content = content
        try:
            last_language_code = self.lang2code(l)
        except KeyError:
            continue

        definitions = {}
        section_init = ct_content.find('==%s==' % l)
        section_end = ct_content.find('----', section_init)
        if section_end != -1:
            ct_content = ct_content[section_init:section_end]
        else:
            ct_content = ct_content[section_init:]

        lines = ct_content.split('\n')
        for line in lines:
            if last_part_of_speech is None:
                last_part_of_speech = self.get_part_of_speech(line)
            # We assume en.wikt definitions start with "# " and extract all
            # definitions from there. Definitions are added as lists of
            # strings, grouped by part of speech, since an Entry can only
            # have one part of speech.
            if line.startswith('# '):
                defn_line = line.lstrip('# ')
                if last_part_of_speech is None:
                    continue
                definition = self.extract_definition(
                    last_part_of_speech,
                    defn_line,
                    cleanup_definition=cleanup_definitions,
                    translate_definitions_to_malagasy=translate_definitions_to_malagasy,
                    human_readable_form_of_definition=human_readable_form_of_definition,
                    advanced=kw.get('advanced', False)
                )
                if last_part_of_speech in definitions:
                    definitions[last_part_of_speech].append(definition)
                else:
                    definitions[last_part_of_speech] = [definition]

        # Fetch additional data if the flag is set, else leave it as None
        if get_additional_data:
            additional_data = self.get_additional_data(ct_content,
                                                       last_language_code)
        else:
            additional_data = None

        # Create the Entry objects to add to the list (the loop variable is
        # renamed so it no longer shadows the `definitions` dict).
        for pos, pos_definitions in definitions.items():
            entry = Entry(
                entry=self.title,
                part_of_speech=pos,
                language=last_language_code,
                definitions=pos_definitions,
            )
            if additional_data is not None and get_additional_data:
                entry.additional_data = {}
                for data_type, data in additional_data.items():
                    if data:
                        entry.additional_data[data_type] = data
            entries.append(entry)
    return entries
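# Illustrative shape of the per-language `definitions` mapping built above,
# keyed by part of speech (POS codes and values hypothetical):
#   {'ana': ['water', 'rain'], 'mat': ['to water (a plant)']}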