def advanced_extract_definition(self, part_of_speech, definition_line,
                                cleanup_definition=True,
                                translate_definitions_to_malagasy=False,
                                human_readable_form_of_definition=True):
    """
    Retrieve a definition from the wiki page.

    :param part_of_speech: targetted part of speech
    :param definition_line: definition line, should start with a "#"
    :param cleanup_definition: remove links/templates?
    :param translate_definitions_to_malagasy: translate to malagasy? (valid for templates)
    :param human_readable_form_of_definition: put the form-of definition as a sentence
    :return: the cleaned-up (and possibly parsed) definition line
    """
    # Caller explicitly asked for the raw line.
    if not cleanup_definition:
        return definition_line

    # Strip non-needed templates to improve readability.
    # In case these templates are needed, integrate your code above this part.
    cleaned = definition_line
    for pattern, substitute in self.regexesrep:
        cleaned = re.sub(pattern, substitute, cleaned)

    # Something readable survived the cleanup: that is the definition.
    if cleaned != '':
        return cleaned

    # Nothing left: the line was pure template(s). Form-of definitions use
    # templates that can be parsed through the api.parsers module, which is
    # tentatively integrated here to give human-readable output for either
    # English or Malagasy.
    if not human_readable_form_of_definition:
        return definition_line

    try:
        if part_of_speech in TEMPLATE_TO_OBJECT:
            parsed = templates_parser.get_elements(
                TEMPLATE_TO_OBJECT[part_of_speech], definition_line)
            target_language = ('mg' if translate_definitions_to_malagasy
                               else self.processor_language)
            cleaned = parsed.to_definition(target_language)
    except ParserNotFoundError:
        # No parser for this template: fall back to the raw line.
        cleaned = definition_line

    return cleaned
def translate_form_of_templates(part_of_speech, definition_line, source_language,
                                target_language,
                                **kw) -> 'UntranslatedDefinition | TranslatedDefinition':
    """
    Translate a "form-of" definition line expressed as a template.

    :param part_of_speech: part of speech of the entry the line belongs to
    :param definition_line: definition line, usually containing a form-of template
    :param source_language: language the definition is written in (kept for
        interface compatibility; not used by the current implementation)
    :param target_language: language to translate the definition into
    :param kw: optional ``language`` key selecting a post-processor
    :return: a form-of translation object when a template parser succeeded,
        an ``UntranslatedDefinition`` when no parser exists, or the cleaned-up
        plain string when the line was not template-only.
    """
    new_definition_line = definition_line

    # Clean up non-needed templates to improve readability.
    # In case these templates are needed, integrate your code above this part.
    for regex, replacement in regexesrep:
        new_definition_line = re.sub(regex, replacement, new_definition_line)

    # An empty result means the line was nothing but templates. Form-of
    # definitions use templates that can be parsed using the api.parsers
    # module, tentatively integrated here to provide human-readable output
    # for either English or Malagasy.
    if new_definition_line == '':
        try:
            if part_of_speech in TEMPLATE_TO_OBJECT:
                elements = templates_parser.get_elements(
                    TEMPLATE_TO_OBJECT[part_of_speech], definition_line)
                # Optional language-specific post-processing of the parsed elements.
                if 'language' in kw and kw['language'] in POST_PROCESSORS:
                    elements = POST_PROCESSORS[kw['language']](elements)
                new_definition_line = FormOfTranslaton(
                    elements.to_definition(target_language))
                if hasattr(elements, 'lemma'):
                    new_definition_line.lemma = elements.lemma
                # Map to the dedicated form-of part of speech when one exists;
                # otherwise keep the original part of speech.
                new_definition_line.part_of_speech = form_of_part_of_speech_mapper.get(
                    part_of_speech, part_of_speech)
        except ParserNotFoundError:
            new_definition_line = UntranslatedDefinition(definition_line)

    return new_definition_line
def create_non_lemma_entry(entry: Entry):
    """
    Create or update the Malagasy wiki page for a non-lemma (form-of) entry.

    Only the first definition of ``entry`` is used. The page is only created
    when the lemma the form refers to already exists on the target wiki and
    carries a section for the entry's language.

    :param entry: the entry to write
    :return: 1 when a page write was attempted (or parsing failed hard),
        0 when the entry was skipped.
    """
    word, pos, code, definition = (
        entry.entry, entry.part_of_speech, entry.language, entry.definitions[0])
    page_output = Output()
    mg_page = pywikibot.Page(pywikibot.Site(SITELANG, SITENAME), word)

    # Translate the template's content into Malagasy.
    try:
        if pos not in TEMPLATE_TO_OBJECT:  # unsupported template
            print("Unsupported template")
            return 0
        output_object_class = TEMPLATE_TO_OBJECT[pos]
        try:
            elements = templates_parser.get_elements(
                output_object_class, definition)
        except Exception:
            return 1
        if code in POST_PROCESSORS:
            elements = POST_PROCESSORS[code](elements)
        if elements is None:
            print("No elements")
            return 0
        malagasy_definition = elements.to_malagasy_definition()
        lemma = elements.lemma
        print(elements, malagasy_definition, lemma)
    except ParserError as exc:
        print(exc)
        return 0

    # Do not create the page if the lemma does not exist.
    if lemma:
        mg_lemma_page = pywikibot.Page(
            pywikibot.Site(SITELANG, SITENAME), lemma)
    else:
        return 1
    try:
        if not mg_lemma_page.exists():
            print('No lemma (%s) :/' % lemma)
            return 0
        # Follow redirects to the actual lemma page, detecting broken chains.
        broken_redirect = False
        while mg_lemma_page.isRedirectPage():
            mg_lemma_page = mg_lemma_page.getRedirectTarget()
            if not mg_lemma_page.exists():
                broken_redirect = True
                break
        if broken_redirect:
            print('No lemma : broken redirect (%s)' % lemma)
            return 0
        content = mg_lemma_page.get()
        # BUGFIX: was `language_code`, which is not defined in this function;
        # every other language-section check here uses `code`.
        if '{{=' + code + '=}}' not in content:
            print('No lemma (%s) :/' % lemma)
            return 0
    except pywikibot.exceptions.InvalidTitle:
        # Doing something wrong at this point.
        return 0
    except Exception:
        return 1

    # Use the dedicated form-of template for this part of speech when defined.
    form_of_template = FORM_OF_TEMPLATE[pos] if pos in FORM_OF_TEMPLATE else pos
    mg_entry = Entry(
        entry=word,
        part_of_speech=form_of_template,
        definitions=[malagasy_definition],
        language=code,
    )

    # The existence of /tmp/<code> toggles page overwriting mid-script.
    overwrite = os.path.isfile('/tmp/%s' % code)
    if overwrite:
        print(('PAGE OVERWRITING IS ACTIVE. '
               'DELETE /tmp/%s TO DISABLE IT MID-SCRIPT.' % code))

    # Create or update the generated page.
    if mg_page.exists() and not overwrite:
        new_entry = page_output.wikipage(mg_entry, link=False)
        page_content = mg_page.get()
        if page_content.find('{{=%s=}}' % code) != -1:
            if page_content.find(
                    '{{-%s-|%s}}' % (form_of_template, code)) != -1:
                print('section already exists : No need to go further')
                return 0
            # Add part-of-speech subsection under the existing language section.
            page_content = re.sub(
                r'==[ ]?{{=%s=}}[ ]?==' % code, new_entry, page_content)
        else:
            # Add language section on top of the page.
            page_content = new_entry + '\n' + page_content
    else:
        # Create a new page.
        page_content = page_output.wikipage(mg_entry, link=False)

    pywikibot.output('\03{blue}%s\03{default}' % page_content)
    try:
        mg_page.put(page_content, f'endriky ny teny [[{lemma}]]')
    except Exception as exc:
        # Best-effort save: report the failure but do not crash the batch.
        print(exc)
    return 1
def import_additional_data(entry: Entry) -> int: word, pos, code, definition = entry.entry, entry.part_of_speech, entry.language, entry.definitions[ 0] # Translate template's content into malagasy try: if pos not in TEMPLATE_TO_OBJECT: # unsupported template print("Unsupported template") return 0 output_object_class = TEMPLATE_TO_OBJECT[pos] try: elements = templates_parser.get_elements( output_object_class, definition) except Exception as exc: raise ParserError from exc except ParserError as exc: print(exc) return 0 if code in POST_PROCESSORS: elements = POST_PROCESSORS[code](elements) if elements is None: print("No elements") return 0 malagasy_definition = elements.to_malagasy_definition() lemma = elements.lemma def get_word_id_query(): rq_params = { 'word': 'eq.' + entry.entry, 'language': 'eq.' + entry.language, 'part_of_speech': 'eq.' + template.strip() } print(rq_params) response = requests.get(db_backend.backend + '/word', rq_params) return response.json() def post_new_word(): rq_params = { 'word': entry.entry, 'language': entry.language, 'part_of_speech': template.strip(), 'date_changed': time.strftime("%Y-%m-%d %H:%M:%S") } print(rq_params) response = requests.post(db_backend.backend + '/word', rq_params) if response.status_code >= 400: print(response.json()) raise AdditionalDataImporterError( f'Response on post is unexpected: {response.status_code}') try: query = get_word_id_query() if len(query) > 0: word_id = query[0]['id'] importer.write_additional_data(word_id, lemma) else: post_new_word() query = get_word_id_query() assert len(query) > 0 if len(query) > 0: word_id = query[0]['id'] importer.write_additional_data(word_id, lemma) except (KeyError, AdditionalDataImporterError) as err: print(err) print(elements, malagasy_definition, lemma) return 0
def create_non_lemma_entry(entry: Entry):
    """
    Create or update the Malagasy wiki page for a non-lemma (form-of) entry.

    Only the first definition of ``entry`` is used. Unlike the lemma check in
    sibling implementations, a missing lemma here only prints a warning — page
    creation proceeds regardless (the early return is deliberately disabled).

    :param entry: the entry to write
    :return: 1 when a page write was attempted (or parsing failed hard),
        0 when the entry was skipped.
    """
    word, pos, code, definition = (
        entry.entry, entry.part_of_speech, entry.language,
        entry.entry_definition[0])
    page_output = Output()
    mg_page = pywikibot.Page(pywikibot.Site(SITELANG, SITENAME), word)

    # Translate the template's content into Malagasy.
    try:
        if pos not in TEMPLATE_TO_OBJECT:  # unsupported template
            print("Unsupported template")
            return 0
        output_object_class = TEMPLATE_TO_OBJECT[pos]
        try:
            elements = templates_parser.get_elements(output_object_class, definition)
        except Exception:
            return 1
        if code in POST_PROCESSORS:
            elements = POST_PROCESSORS[code](elements)
        if elements is None:
            print("No elements")
            return 0
        malagasy_definition = elements.to_malagasy_definition()
        lemma = elements.lemma
        print(elements, malagasy_definition, lemma)
    except ParserError as exc:
        print(exc)
        return 0

    # Warn when the lemma is unknown; the early return is intentionally
    # disabled so the form-of page is still created.
    if lemma not in PAGE_SET:
        print('No lemma (%s) :/' % lemma)
        # return 0

    # Use the dedicated form-of template for this part of speech when defined.
    form_of_template = FORM_OF_TEMPLATE[pos] if pos in FORM_OF_TEMPLATE else pos
    mg_entry = Entry(
        entry=word,
        part_of_speech=form_of_template,
        entry_definition=[malagasy_definition],
        language=code,
    )

    # The existence of /tmp/<code> toggles page overwriting mid-script.
    overwrite = os.path.isfile('/tmp/%s' % code)
    if overwrite:
        print(('PAGE OVERWRITING IS ACTIVE. DELETE /tmp/%s TO DISABLE IT MID-SCRIPT.' % code))

    # Create or update the generated page.
    if mg_page.exists() and not overwrite:
        new_entry = page_output.wikipage(mg_entry, link=False)
        page_content = mg_page.get()
        if page_content.find('{{=%s=}}' % code) != -1:
            if page_content.find('{{-%s-|%s}}' % (form_of_template, code)) != -1:
                print('section already exists : No need to go further')
                return 0
            # Add part-of-speech subsection under the existing language section.
            page_content = re.sub(r'==[ ]?{{=%s=}}[ ]?==' % code, new_entry, page_content)
        else:
            # Add language section on top of the page.
            page_content = new_entry + '\n' + page_content
    else:
        # Create a new page.
        page_content = page_output.wikipage(mg_entry, link=False)

    pywikibot.output('\03{blue}%s\03{default}' % page_content)
    try:
        mg_page.put(page_content, 'Teny vaovao')
    except Exception as exc:
        # Best-effort save: report the failure but do not crash the batch
        # (was a silent `pass`, which hid every save error).
        print(exc)
    return 1
def create_missing_entries(self, xml_buffer: str):
    """
    Process one XML page dump buffer and write entries missing translations.

    Entries in ``self.language`` are translated through the lookup table
    ("local" path); entries in other languages have each definition translated
    word-by-word, falling back to the form-of template parser, and untranslatable
    definitions are recorded in ``self.missing_translation_writer``.

    :param xml_buffer: one <page> XML fragment from the dump.
    """
    title_node, content_node = self.base_worker(xml_buffer)
    assert title_node is not None
    # Skip non-main-namespace pages (titles containing a colon).
    if ':' in title_node:
        return
    # Lazily build the wiki processor on first use.
    if self.processor_class is None:
        self.processor_class = WiktionaryProcessorFactory.create('en')
    assert self.processor_class is not None
    processor = self.processor_class()
    processor.set_title(title_node)
    processor.set_text(content_node)
    entries = processor.get_all_entries()
    for entry in entries:
        if entry.language == self.language:
            # "Local" path: the entry is in the wiki's own language.
            if self.translation_lookup_table.lookup(entry):
                translation = self.translation_lookup_table.translate(
                    entry)
                new_entry = Entry(entry=entry.entry,
                                  definitions=translation,
                                  language=entry.language,
                                  part_of_speech=entry.part_of_speech)
                #print('local >', new_entry)
                self.entry_writer.add(new_entry)
                # Propagate the same translated definitions to every
                # translation of this entry found on the page.
                for e in processor.retrieve_translations():
                    e.definitions = translation
                    self.entry_writer.add(e)
                    #print('local translation >', e)
        else:
            # "Foreign" path: translate each definition word by word.
            # RIP cyclomatic complexity.
            translations = []
            pos = entry.part_of_speech
            for definition in entry.definitions:
                try:
                    # NOTE(review): `language` is not defined in this method —
                    # presumably a module-level global, but it may be a bug
                    # (should perhaps be `self.language`); confirm.
                    translation = self.translation_lookup_table.translate_word(
                        definition, language, entry.part_of_speech)
                except LookupError:
                    # Translation couldn't be found in lookup table
                    if entry.part_of_speech in TEMPLATE_TO_OBJECT:
                        try:
                            # Try inflection template parser
                            elements = templates_parser.get_elements(
                                TEMPLATE_TO_OBJECT[entry.part_of_speech],
                                definition)
                        except Exception:
                            # add to missing translations
                            self.missing_translation_writer.add(definition)
                        else:
                            if elements:
                                # part of speech changes to become a
                                # form-of part of speech
                                if not pos.startswith('e-'):
                                    pos = 'e-' + pos
                                translations.append(
                                    elements.to_malagasy_definition())
                else:
                    translations.append(translation[0])
            if translations:
                # De-duplicate translated definitions before writing.
                new_entry = Entry(entry=entry.entry,
                                  definitions=list(set(translations)),
                                  language=entry.language,
                                  part_of_speech=pos)
                #print('foreign >', new_entry)
                self.entry_writer.add(new_entry)