コード例 #1
0
ファイル: en.py プロジェクト: radomd92/botjagwar
    def advanced_extract_definition(self, part_of_speech, definition_line,
                                    cleanup_definition=True,
                                    translate_definitions_to_malagasy=False,
                                    human_readable_form_of_definition=True
                                    ):
        """
        Build the definition text for one definition line of a wiki page.

        :param part_of_speech: part of speech, used as the key into
            TEMPLATE_TO_OBJECT to pick a form-of template parser
        :param definition_line: raw definition line, should start with a "#"
        :param cleanup_definition: when false, the line is returned untouched
        :param translate_definitions_to_malagasy: render a parsed form-of
            definition in 'mg' instead of self.processor_language
        :param human_readable_form_of_definition: when true, try to render a
            line emptied by the clean-up as a sentence
        :return: one of
            - the original line (no clean-up requested, clean-up left some
              text, or no parser was found for the template),
            - the rendered form-of definition,
            - an empty string (the clean-up emptied the line and no form-of
              rendering was requested or available for the part of speech)
        """
        if not cleanup_definition:
            return definition_line

        # Apply every (pattern, replacement) clean-up rule in order.
        cleaned = definition_line
        for pattern, substitute in self.regexesrep:
            cleaned = re.sub(pattern, substitute, cleaned)

        # A line that still has text after the clean-up is handed back in
        # its original, un-cleaned form.
        if cleaned != '':
            return definition_line

        if not human_readable_form_of_definition:
            return cleaned

        # The clean-up emptied the line: try to parse the original line as a
        # form-of template and render it as a sentence.
        try:
            if part_of_speech in TEMPLATE_TO_OBJECT:
                parsed = templates_parser.get_elements(
                    TEMPLATE_TO_OBJECT[part_of_speech], definition_line)
                if translate_definitions_to_malagasy:
                    return parsed.to_definition('mg')
                return parsed.to_definition(self.processor_language)
        except ParserNotFoundError:
            return definition_line

        return cleaned
コード例 #2
0
def translate_form_of_templates(part_of_speech,
                                definition_line,
                                source_language,
                                target_language,
                                **kw) -> [UntranslatedDefinition,
                                          TranslatedDefinition]:
    """
    Render a form-of template definition line in the target language.

    The module-level clean-up rules (``regexesrep``) are applied first. Only
    a line that those rules reduce to an empty string is parsed as a form-of
    template; otherwise the cleaned text is returned as is.

    :param part_of_speech: key into TEMPLATE_TO_OBJECT selecting the parser
    :param definition_line: raw definition line
    :param source_language: not used by this function
    :param target_language: language passed to ``to_definition``
    :param kw: optional ``language`` key selecting a POST_PROCESSORS entry
    :return: the cleaned string when it is not empty or when the part of
        speech has no parser; a FormOfTranslaton carrying ``part_of_speech``
        (and ``lemma`` when the parsed elements have one) on success; an
        UntranslatedDefinition wrapping the raw line when no parser is found
    """
    stripped = definition_line

    # Apply every (pattern, replacement) clean-up rule in order.
    for pattern, substitute in regexesrep:
        stripped = re.sub(pattern, substitute, stripped)

    # Only lines emptied by the clean-up are treated as form-of templates.
    if stripped != '':
        return stripped

    try:
        if part_of_speech not in TEMPLATE_TO_OBJECT:
            return stripped

        parsed = templates_parser.get_elements(
            TEMPLATE_TO_OBJECT[part_of_speech], definition_line)

        # Optional language-specific post-processing of the parsed elements.
        if 'language' in kw and kw['language'] in POST_PROCESSORS:
            parsed = POST_PROCESSORS[kw['language']](parsed)

        translated = FormOfTranslaton(parsed.to_definition(target_language))
        if hasattr(parsed, 'lemma'):
            translated.lemma = parsed.lemma

        # Map to the form-of part of speech when a mapping exists.
        translated.part_of_speech = (
            form_of_part_of_speech_mapper[part_of_speech]
            if part_of_speech in form_of_part_of_speech_mapper
            else part_of_speech
        )
    except ParserNotFoundError:
        return UntranslatedDefinition(definition_line)

    return translated
コード例 #3
0
ファイル: word_forms.py プロジェクト: radomd92/botjagwar
def create_non_lemma_entry(entry: Entry):
    """
    Create or update the wiki page of a non-lemma (form-of) entry.

    The first definition of ``entry`` is parsed with the template parser
    registered for the entry's part of speech, rendered as a Malagasy
    definition, and written to the page named after the entry on the
    SITELANG/SITENAME site. Nothing is written unless the lemma page exists
    (redirects are followed) and contains the expected language section.

    :param entry: entry whose ``entry``, ``part_of_speech``, ``language`` and
        first item of ``definitions`` are read
    :return: 0 when the template is unsupported, no elements are produced, a
        ParserError occurs, the lemma page is missing / a broken redirect /
        lacks the language section, the lemma title is invalid, or the
        section already exists on the target page; 1 when parsing raised any
        other exception, the lemma is empty, the lemma check raised an
        unexpected exception, or after a save was attempted
    """
    word, pos, code, definition = entry.entry, entry.part_of_speech, entry.language, entry.definitions[
        0]
    page_output = Output()
    mg_page = pywikibot.Page(pywikibot.Site(SITELANG, SITENAME), word)

    # Translate template's content into malagasy
    try:
        if pos not in TEMPLATE_TO_OBJECT:  # unsupported template
            print("Unsupported template")
            return 0

        output_object_class = TEMPLATE_TO_OBJECT[pos]
        try:
            elements = templates_parser.get_elements(
                output_object_class, definition)
        except Exception:
            return 1

        if code in POST_PROCESSORS:
            elements = POST_PROCESSORS[code](elements)

        if elements is None:
            print("No elements")
            return 0

        malagasy_definition = elements.to_malagasy_definition()
        lemma = elements.lemma
        print(elements, malagasy_definition, lemma)
    except ParserError as exc:
        print(exc)
        return 0

    # Do not create page if lemma does not exist
    if not lemma:
        return 1
    mg_lemma_page = pywikibot.Page(
        pywikibot.Site(SITELANG, SITENAME), lemma)

    try:
        if not mg_lemma_page.exists():
            print('No lemma (%s) :/' % lemma)
            return 0

        # Follow the redirect chain until a real page or a dead end.
        broken_redirect = False
        while mg_lemma_page.isRedirectPage():
            mg_lemma_page = mg_lemma_page.getRedirectTarget()
            if not mg_lemma_page.exists():
                broken_redirect = True
                break

        if broken_redirect:
            print('No lemma : broken redirect (%s)' % lemma)
            return 0

        content = mg_lemma_page.get()
        # NOTE(review): `language_code` is not defined in this function; it
        # is presumably a module-level name. If it is missing, the NameError
        # is swallowed by the generic handler below and 1 is returned.
        if '{{=' + language_code + '=}}' not in content:
            print('No lemma (%s) :/' % lemma)
            return 0
    except pywikibot.exceptions.InvalidTitle:  # doing something wrong at this point
        return 0
    except Exception:
        return 1

    form_of_template = FORM_OF_TEMPLATE[pos] if pos in FORM_OF_TEMPLATE else pos

    mg_entry = Entry(
        entry=word,
        part_of_speech=form_of_template,
        definitions=[malagasy_definition],
        language=code,
    )

    # Check ability to overwrite page: the presence of the flag file
    # /tmp/<language> switches overwriting on.
    overwrite = os.path.isfile('/tmp/%s' % code)
    if overwrite:
        print(
            ('PAGE OVERWRITING IS ACTIVE. DELETE /tmp/%s TO DISABLE IT MID-SCRIPT.' % code))

    # Create or update the generated page
    if mg_page.exists() and not overwrite:
        new_entry = page_output.wikipage(mg_entry, link=False)
        page_content = mg_page.get()
        language_marker = '{{=%s=}}' % code
        section_marker = '{{-%s-|%s}}' % (form_of_template, code)
        if language_marker in page_content:
            if section_marker in page_content:
                print('section already exists : No need to go further')
                return 0
            # Add part of speech subsection.
            # The language code is escaped so it is matched literally, and
            # the replacement is a callable so that backslashes in the
            # generated wikitext are not interpreted as regex escapes.
            page_content = re.sub(
                r'==[ ]?{{=%s=}}[ ]?==' % re.escape(code),
                lambda _match: new_entry,
                page_content)
        else:  # Add language section
            page_content = new_entry + '\n' + page_content
    else:  # Create a new page.
        page_content = page_output.wikipage(mg_entry, link=False)

    pywikibot.output('\03{blue}%s\03{default}' % page_content)
    try:
        mg_page.put(page_content, f'endriky ny teny [[{lemma}]]')
    except Exception as exc:
        # Saving stays best-effort, but the failure is reported instead of
        # being silently discarded.
        print('Could not save page %s: %s' % (word, exc))

    return 1
コード例 #4
0
ファイル: word_forms.py プロジェクト: radomd92/botjagwar
def import_additional_data(entry: Entry) -> int:
    """
    Store the lemma of a form-of entry as additional data of its word row.

    The first definition of ``entry`` is parsed with the template parser
    registered for its part of speech. The word row is then looked up through
    the ``/word`` endpoint of ``db_backend.backend`` (and created first when
    the lookup returns nothing), and the lemma is passed to
    ``importer.write_additional_data`` together with the row id.

    :param entry: entry whose ``entry``, ``part_of_speech``, ``language`` and
        first item of ``definitions`` are read
    :return: always 0
    """
    word = entry.entry
    pos = entry.part_of_speech
    code = entry.language
    definition = entry.definitions[0]

    if pos not in TEMPLATE_TO_OBJECT:  # unsupported template
        print("Unsupported template")
        return 0

    # Translate template's content into malagasy. Any parser failure is
    # converted to a ParserError, which is printed before giving up.
    output_object_class = TEMPLATE_TO_OBJECT[pos]
    try:
        try:
            elements = templates_parser.get_elements(
                output_object_class, definition)
        except Exception as exc:
            raise ParserError from exc
    except ParserError as exc:
        print(exc)
        return 0

    if code in POST_PROCESSORS:
        elements = POST_PROCESSORS[code](elements)

    if elements is None:
        print("No elements")
        return 0

    malagasy_definition = elements.to_malagasy_definition()
    lemma = elements.lemma

    # NOTE(review): `template` is not defined in this function; it is
    # presumably a module-level name - confirm.
    def fetch_word_rows():
        """Query the backend for rows matching this word/language/POS."""
        filters = {
            'word': 'eq.' + entry.entry,
            'language': 'eq.' + entry.language,
            'part_of_speech': 'eq.' + template.strip()
        }
        print(filters)
        reply = requests.get(db_backend.backend + '/word', filters)
        return reply.json()

    def create_word_row():
        """Post a new word row; raise when the backend answers >= 400."""
        payload = {
            'word': entry.entry,
            'language': entry.language,
            'part_of_speech': template.strip(),
            'date_changed': time.strftime("%Y-%m-%d %H:%M:%S")
        }
        print(payload)
        reply = requests.post(db_backend.backend + '/word', payload)
        if reply.status_code >= 400:
            print(reply.json())
            raise AdditionalDataImporterError(
                f'Response on post is unexpected: {reply.status_code}')

    try:
        rows = fetch_word_rows()
        if not len(rows) > 0:
            # Unknown word: create it, then look it up again to get its id.
            create_word_row()
            rows = fetch_word_rows()
            assert len(rows) > 0
        if len(rows) > 0:
            word_id = rows[0]['id']
            importer.write_additional_data(word_id, lemma)
    except (KeyError, AdditionalDataImporterError) as err:
        print(err)

    print(elements, malagasy_definition, lemma)
    return 0
コード例 #5
0
ファイル: word_forms.py プロジェクト: radomd92/botjagwar
def create_non_lemma_entry(entry: Entry):
    """
    Create or update the wiki page of a non-lemma (form-of) entry.

    The first item of ``entry.entry_definition`` is parsed with the template
    parser registered for the entry's part of speech, rendered as a Malagasy
    definition, and written to the page named after the entry on the
    SITELANG/SITENAME site.

    :param entry: entry whose ``entry``, ``part_of_speech``, ``language`` and
        first item of ``entry_definition`` are read
    :return: 0 when the template is unsupported, no elements are produced, a
        ParserError occurs, or the section already exists on the target
        page; 1 when parsing raised any other exception or after a save was
        attempted
    """
    word, pos, code, definition = entry.entry, entry.part_of_speech, entry.language, entry.entry_definition[0]
    page_output = Output()
    mg_page = pywikibot.Page(pywikibot.Site(SITELANG, SITENAME), word)

    # Translate template's content into malagasy
    try:
        if pos not in TEMPLATE_TO_OBJECT:  # unsupported template
            print("Unsupported template")
            return 0
        output_object_class = TEMPLATE_TO_OBJECT[pos]
        try:
            elements = templates_parser.get_elements(output_object_class, definition)
        except Exception:
            return 1
        if code in POST_PROCESSORS:
            elements = POST_PROCESSORS[code](elements)
        if elements is None:
            print("No elements")
            return 0
        malagasy_definition = elements.to_malagasy_definition()
        lemma = elements.lemma
        print(elements, malagasy_definition, lemma)
    except ParserError as exc:
        print(exc)
        return 0

    # A lemma missing from PAGE_SET is only reported; processing continues.
    if lemma not in PAGE_SET:
        print('No lemma (%s) :/' % lemma)

    form_of_template = FORM_OF_TEMPLATE[pos] if pos in FORM_OF_TEMPLATE else pos

    mg_entry = Entry(
        entry=word,
        part_of_speech=form_of_template,
        entry_definition=[malagasy_definition],
        language=code,
    )

    # Check ability to overwrite page: the presence of the flag file
    # /tmp/<language> switches overwriting on.
    overwrite = os.path.isfile('/tmp/%s' % code)
    if overwrite:
        print(('PAGE OVERWRITING IS ACTIVE. DELETE /tmp/%s TO DISABLE IT MID-SCRIPT.' % code))

    # Create or update the generated page
    if mg_page.exists() and not overwrite:
        new_entry = page_output.wikipage(mg_entry, link=False)
        page_content = mg_page.get()
        language_marker = '{{=%s=}}' % code
        section_marker = '{{-%s-|%s}}' % (form_of_template, code)
        if language_marker in page_content:
            if section_marker in page_content:
                print('section already exists : No need to go further')
                return 0
            # Add part of speech subsection.
            # The language code is escaped so it is matched literally, and
            # the replacement is a callable so that backslashes in the
            # generated wikitext are not interpreted as regex escapes.
            page_content = re.sub(
                r'==[ ]?{{=%s=}}[ ]?==' % re.escape(code),
                lambda _match: new_entry,
                page_content)
        else:  # Add language section
            page_content = new_entry + '\n' + page_content
    else:  # Create a new page.
        page_content = page_output.wikipage(mg_entry, link=False)

    pywikibot.output('\03{blue}%s\03{default}' % page_content)
    try:
        mg_page.put(page_content, 'Teny vaovao')
    except Exception as exc:
        # Saving stays best-effort, but the failure is reported instead of
        # being silently discarded.
        print('Could not save page %s: %s' % (word, exc))
    return 1
コード例 #6
0
    def create_missing_entries(self, xml_buffer: str):
        """
        Queue translated entries for every entry found in one dump page.

        The page title and text are obtained from ``self.base_worker``. Pages
        whose title contains ":" are skipped. Entries in ``self.language``
        are translated through the lookup table as a whole; entries in other
        languages are translated definition by definition, falling back to
        the form-of template parser when the lookup fails.

        :param xml_buffer: raw page buffer handed to ``self.base_worker``
        :return: None; results are pushed to ``self.entry_writer`` and
            ``self.missing_translation_writer``
        """
        title_node, content_node = self.base_worker(xml_buffer)

        assert title_node is not None
        if ':' in title_node:
            return
        if self.processor_class is None:
            self.processor_class = WiktionaryProcessorFactory.create('en')
            assert self.processor_class is not None

        processor = self.processor_class()
        processor.set_title(title_node)
        processor.set_text(content_node)

        lookup_table = self.translation_lookup_table
        for entry in processor.get_all_entries():
            if entry.language == self.language:
                # Entry in the working language: translate it as a whole and
                # reuse that translation for the page's translation entries.
                if not lookup_table.lookup(entry):
                    continue
                local_translation = lookup_table.translate(entry)
                self.entry_writer.add(
                    Entry(entry=entry.entry,
                          definitions=local_translation,
                          language=entry.language,
                          part_of_speech=entry.part_of_speech))
                for translated_entry in processor.retrieve_translations():
                    translated_entry.definitions = local_translation
                    self.entry_writer.add(translated_entry)
                continue

            # Entry in another language: translate each definition.
            collected = []
            pos = entry.part_of_speech
            for definition in entry.definitions:
                try:
                    # NOTE(review): `language` is not defined in this method;
                    # presumably a module-level name - confirm.
                    found = lookup_table.translate_word(
                        definition, language, entry.part_of_speech)
                except LookupError:
                    # Translation couldn't be found in lookup table
                    pass
                else:
                    collected.append(found[0])
                    continue

                if entry.part_of_speech not in TEMPLATE_TO_OBJECT:
                    continue

                try:
                    # Try inflection template parser
                    elements = templates_parser.get_elements(
                        TEMPLATE_TO_OBJECT[entry.part_of_speech],
                        definition)
                except Exception:
                    # add to missing translations
                    self.missing_translation_writer.add(definition)
                    continue

                if elements:
                    # part of speech changes to become a form-of part of
                    # speech
                    if not pos.startswith('e-'):
                        pos = 'e-' + pos
                    collected.append(elements.to_malagasy_definition())

            if collected:
                self.entry_writer.add(
                    Entry(entry=entry.entry,
                          definitions=list(set(collected)),
                          language=entry.language,
                          part_of_speech=pos))