Example #1
def main():

    import argparse
    import sys
    argparser = argparse.ArgumentParser(description="Find -ismo without -ista and vice versa")
    argparser.add_argument("file", help="Extract file to read")
    argparser.add_argument("--allforms", help="Allforms for checking lemmas")
    argparser.add_argument("--save", help="Save to wiktionary with specified commit message")
    argparser.add_argument("--date", help="Date of the database dump (used to generate page messages)")
    argparser.add_argument("--limit", type=int, help="Limit processing to first N articles")
    argparser.add_argument("--progress", help="Display progress", action='store_true')
    args = argparser.parse_args()
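
    # Typical invocation (hypothetical file names, shown for illustration only):
    #   python ismo_ista.py es-en.enwikt.txt.bz2 --allforms all_forms.csv --progress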

    count = 0

    allforms = AllForms.from_file(args.allforms)
    all_lemmas = set(allforms.all_lemmas)
    for article in WikiExtract.iter_articles_from_bz2(args.file):

        if not count % 1000 and args.progress:
            print(count, end='\r', file=sys.stderr)

        if args.limit and count >= args.limit:
            break
        count += 1

        text = article.text
        path = article.title.split(":")
        page = path[0]
        pos = path[-1]

        if page.endswith("ismo"):
            error = "ismo_without_ista"
            search = page[:-4] + "ista"
        elif page.endswith("ista"):
            error = "ista_without_ismo"
            search = page[:-4] + "ismo"
        else:
            continue

        if page not in all_lemmas:
            continue

        if search in all_lemmas and search not in text:
            log(error, page)

    if args.save:
        base_url = "User:JeffDoozan/lists"
        logger.save(base_url, WikiSaver, commit_message=args.save)
    else:
        dest = ""
        logger.save(dest, FileSaver)
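
Note: Examples #1, #5, and #6 call a module-level log() helper and logger object that are defined outside these snippets; Example #2 shows the pattern in use (Logger(), logger.add(...), logger.save(...)). A minimal sketch of that scaffolding, assuming the logger only collects rows and hands them off to a saver class (the real Logger/WikiSaver interface may differ):

class Logger:
    def __init__(self):
        self._rows = []

    def add(self, error, *data):
        # Collect one row per reported problem, keyed by error type
        self._rows.append((error, *data))

    def save(self, dest, saver_class, **kwargs):
        # Delegate formatting and writing to the saver (wiki page or local file)
        saver_class(self._rows).save(dest, **kwargs)

logger = Logger()

def log(error, page):
    # Thin wrapper used by Example #1
    logger.add(error, page)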
Example #2
def main():

    import argparse
    argparser = argparse.ArgumentParser(
        description="Find lemmas with only 'form of' senses")
    argparser.add_argument("--trans", help="Extract file to read")
    argparser.add_argument("--allforms",
                           help="Allforms for resolving forms to lemmas")
    argparser.add_argument(
        "--save", help="Save to wiktionary with specified commit message")
    argparser.add_argument(
        "--date",
        help="Date of the database dump (used to generate page messages)")
    argparser.add_argument("--limit",
                           type=int,
                           help="Limit processing to first N articles")
    argparser.add_argument("--progress",
                           help="Display progress",
                           action='store_true')
    argparser.add_argument("--dump-aliases",
                           help="Dump likely language aliases",
                           action='store_true')
    argparser.add_argument("--dump-parents",
                           help="Dump likely parent languages",
                           action='store_true')
    args = argparser.parse_args()

    allforms = AllForms.from_file(args.allforms) if args.allforms else None
    if not os.path.isfile(args.trans):
        raise FileNotFoundError(f"Cannot open: {args.trans}")

    fixer = T9nFixer(allforms)
    logger = Logger()

    def log(error, page, pos, gloss, language, line="", highlight=""):
        if error is None:
            raise ValueError("error is none")
        if page is None:
            raise ValueError("page is none")
        if pos is None:
            raise ValueError("pos is none")

        # Coerce the optional fields so downstream formatting never sees None
        gloss = gloss or ""
        language = language or ""
        line = line or ""
        highlight = highlight or ""

        logger.add(error, page, pos, gloss, language, line, highlight)

#        if language:
#            langlogger.add(error, page, pos, gloss, language, line, highlight)

#        if error != "text_outside_template":
#            logger.add(error, page, pos, gloss, line, highlight)

    count = 0
    max_val = 0
    pages_with_tables = set()

    for article in WikiExtract.iter_articles_from_bz2(args.trans):
        text = article.text
        path = article.title.split(":")
        page = path[0]
        pos = path[-1]

        if pos not in ALL_POS:
            log("outside_pos", page, pos, None, None, path)

        count += 1
        if not count % 1000 and args.progress:
            print(count, end='\r', file=sys.stderr)
        if args.limit and count > args.limit:
            break

#        if page != "pie-eyed":
#            continue

#        if pathstr != "veggie:English:Adjective":
#        if pathstr != "I love you:English:Phrase":
#            continue
#        print("\n", count)

#        val = timeit.timeit(lambda: list(TranslationTable.find_tables(text, page, pos)), number=1)
#        if val > max_val:
#            max_val = val
#            max_page = pathstr
#        continue

        tables = list(TranslationTable.find_tables(text))
        if not tables:
            if not re.search(r"\{\{\s*(trans-see|checktrans|see translation)", text):
                log("no_tables", page, pos, None, None)
#                max_page = "X"
            continue

        pages_with_tables.add(page)
        stats["sections_with_tables"] += 1
        for table_text in tables:
            table_lines = table_text.splitlines()
#            print(table_lines)
#            exit()
#            max_val += len(table_lines)
#            continue

            table = TranslationTable(page, pos, table_lines, log_function=log)

            stats["total_tables"] += 1
            seen = set()
            for item in table.items:
                if isinstance(item, TranslationLine) and item.lang_id not in seen:
                    stats["total_entries"] += len(item.entries)
                    stats["lang_entries"][lang_ids[item.lang_id]] += 1
                    # Don't count more than one entry per table
                    seen.add(item.lang_id)

            if len(tables) > 1 and not table.gloss and table.template in (
                    "trans-top", "trans-top-see", "trans-top-also"):
                table.log("no_gloss")
            fixer.cleanup_table(table)


#            if "\n".join(map(str.strip,table_lines)) != str(table):
#                table.log("botfix_formatting")
#                print("OLD", page, pos, file=sys.stderr)
#                print("\n".join(table_lines), file=sys.stderr)
#                print("NEW", page, pos)
#                print(str(table))
#exit()

    stats["pages_with_tables"] = len(pages_with_tables)

    #    print(max_val, max_page)

    #    base_url = "User:JeffDoozan/lists/translations" if args.save else "Xtranslations"
    #    langlogger.save(base_url, args.save)

    if args.save:
        base_url = "User:JeffDoozan/lists/translations"
        logger.save(base_url,
                    WikiByLanguage,
                    commit_message=args.save,
                    page_limit=1000,
                    data_date=args.date)
        logger.save(base_url + "/by_error",
                    WikiByError,
                    commit_message=args.save,
                    data_date=args.date)
    else:
        dest = "Xtranslations"
        logger.save(dest, FileByLanguage, page_limit=1000, data_date=args.date)
        logger.save(dest + "/by_error", FileByError, data_date=args.date)

    # Dump nested language aliases
    if args.dump_aliases:
        print("language_aliases = {")
        #for lang,codes in sorted(UNKNOWN_LANGS.items(), key=lambda x: sum(x[1].values())*-1):
        for lang, codes in sorted(UNKNOWN_LANGS.items()):
            for code, count in sorted(codes.items(), key=lambda x: x[1] * -1):
                if count > 20:
                    print(f"    '{lang}': '{lang_ids[code]}', # {code} found in {count} entries")
                # Only the most common code for each language name is considered
                break
        print("}")

    if args.dump_parents:
        print("language_parents = {")
        for lang, count in sorted(LANG_PARENTS.items()):
            if count > 20:
                print(f"    '{lang}', # used in {count} entries")
        print("}")

    colons = [x for x in lang_ids.values() if ":" in x]
    if colons:
        raise ValueError(
            "A language exists with a colon in its name; this may cause problems"
            " for nested languages that use ':' as a separator")

    print(f"Total pages with tables: {stats['pages_with_tables']}")
    print(f"Total sections with tables: {stats['sections_with_tables']}")
    total_lines = sum(stats["lang_entries"].values())
    print(f"Total language lines in tables: {total_lines}")
    print(f"Total translation entries: {stats['total_entries']}")
Example #3
    if not args.data_dir:
        args.data_dir = os.environ.get("SPANISH_DATA_DIR", "spanish_data")

    if not args.custom_dir:
        args.custom_dir = os.environ.get("SPANISH_CUSTOM_DIR", "spanish_custom")

    with open(args.dictionary) as wordlist_data:
        cache_words = not args.low_mem
        wordlist = Wordlist(wordlist_data, cache_words=cache_words)

    print("wordlist", mem_use(), file=sys.stderr)
    ignore_data = open(args.ignore) if args.ignore else []

    if args.allforms:
        allforms = AllForms.from_file(args.allforms)
    else:
        allforms = AllForms.from_wordlist(wordlist)
    print("all_forms", mem_use(), file=sys.stderr)

    sentences = spanish_sentences(
        sentences=args.sentences, data_dir=args.data_dir, custom_dir=args.custom_dir
    )

    flist = FrequencyList(wordlist, allforms, sentences)

    with open(args.file) as infile:
        for line in flist.process(infile, ignore_data):
            print(line)

    if args.ignore:
        ignore_data.close()
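
Note: the optional ignore file is easier to handle with contextlib.nullcontext, which sidesteps the close-a-list bug fixed above. A sketch of the same flow, assuming flist.process accepts any iterable:

import contextlib

ignore_ctx = open(args.ignore) if args.ignore else contextlib.nullcontext([])
with ignore_ctx as ignore_data, open(args.file) as infile:
    # nullcontext([]) yields an empty list when no ignore file was given,
    # and the with-block closes the file when one was opened
    for line in flist.process(infile, ignore_data):
        print(line)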
Example #4
    def allforms(self):
        # Lazily load the all-forms table the first time it is requested
        if not self._allforms:
            self._allforms = AllForms.from_file(self.allforms_file)
        return self._allforms
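
Note: the same lazy-initialization pattern can be written with the stdlib functools.cached_property (Python 3.8+), which computes the value on first access and caches it on the instance. A sketch, assuming a hypothetical class that stores allforms_file:

from functools import cached_property

class FormChecker:  # hypothetical holder class, for illustration only
    def __init__(self, allforms_file):
        self.allforms_file = allforms_file

    @cached_property
    def allforms(self):
        # Computed once on first access, then cached on the instance
        return AllForms.from_file(self.allforms_file)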
Example #5
def main():
    global args
    global ngprobs

    parser = argparse.ArgumentParser(description="Summarize ngram usage")
    parser.add_argument(
        "--allforms",
        required=True,
        help="Exclude coordinate terms that have standalone entries")
    parser.add_argument("--min-count",
                        type=int,
                        default=0,
                        help="Ignore forms with less than N uses")
    parser.add_argument(
        "--min-percent",
        type=int,
        default=0,
        help="Ignore coordinate terms used less than N percent of the form's uses")
    parser.add_argument(
        "--save", help="Save to wiktionary with specified commit message")
    parser.add_argument(
        "--ignore2",
        action='append',
        help="Ignore coords containing the specified word (can be used more than once)")
    parser.add_argument("--ngprobs", help="Ngram probability data file")
    parser.add_argument(
        "--coord2", help="File containing 2 word coordinate terms to check")
    parser.add_argument(
        "--coord3", help="File containing 3 word coordinate terms to check")
    parser.add_argument(
        "--coord4", help="File containing 4 word coordinate terms to check")
    args = parser.parse_args()

    allforms = AllForms.from_file(args.allforms)
    all_forms = set(allforms.all_forms)
    print("all_forms")
    lemma_forms = defaultdict(list)
    for form, pos, lemma in allforms.all:
        lemma_forms[(lemma, pos)].append(form)
    print("lemma_forms")

    alt_case = {form.lower(): form for form in all_forms if form != form.lower()}

    ngprobs = NgramPosProbability(args.ngprobs)

    if False:  # manual debugging scaffold, enable by hand
        coord = "reharás tu vida"
        words = coord.split(" ")
        print([coord, get_coord_lemma(ngprobs, allforms, words)])
        exit()

        form = "fijamente"
        form_pos = ngprobs.get_preferred_pos(form)
        form_lemma = get_form_lemma(ngprobs, allforms, form)
        count = get_lemma_count(ngprobs, lemma_forms, form_lemma, form_pos)
        print([form, form_pos, form_lemma, count])
        exit()

    all_coords = {}
    if args.coord2:
        all_coords |= find_coords(allforms, all_forms, ngprobs, alt_case,
                                  args.coord2, args.ignore2)
    if args.coord3:
        all_coords |= find_coords(allforms, all_forms, ngprobs, alt_case,
                                  args.coord3)
    if args.coord4:
        all_coords |= find_coords(allforms, all_forms, ngprobs, alt_case,
                                  args.coord4)


#    seen2 = set()
#    all_coords = {}
#    if args.coord3:
#        coord_lemmas = find_coords(allforms, all_forms, ngprobs, alt_case, args.coord3)
#        for k in coord_lemmas.keys():
#            coord_lemma, form, form_pos = k
#            words = coord_lemma.split(" ")
#            seen2.add(words[0:2])
#            seen2.add(words[1:3])
#        all_coords |= coord_lemmas
#
#    if args.coord2:
#        coord_lemmas = find_coords(allforms, all_forms, ngprobs, alt_case, args.coord2, args.ignore2)
#        for k,count in coord_lemmas.items():
#            coord_lemma, form, form_pos = k
#            if coord_lemma in seen2 and coord_lemma not in all_forms:
#                continue
#            all_coords[k] = count

    for k, coord_count in all_coords.items():
        coord_lemma, form_lemma, form_pos = k
        form_count = get_lemma_count(ngprobs, lemma_forms, form_lemma,
                                     form_pos)

        # Skip uncommon forms (also avoids dividing by zero below)
        if form_count < max(args.min_count, 1):
            continue

        # Skip coordinate terms below the minimum share of the form's uses
        percent = int(coord_count / form_count * 100)
        if percent < args.min_percent:
            continue

        if coord_lemma in all_forms:
            existing = coord_lemma
        elif coord_lemma.lower() in all_forms:
            existing = coord_lemma.lower()
        else:
            existing = None
        if existing:
            coord_lemma = f"[[{existing}]]"

        log(form_lemma, coord_lemma, form_count, coord_count, percent)

    if args.save:
        base_url = "User:JeffDoozan/lists"
        logger.save(base_url, WikiSaver, commit_message=args.save)
    else:
        dest = ""
        logger.save(dest, FileSaver)
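
Note: the all_coords |= find_coords(...) calls rely on the in-place dict union operator added in Python 3.9; on older interpreters the equivalent is dict.update(). A quick illustration:

totals = {"a": 1}
totals |= {"b": 2}       # Python 3.9+: {'a': 1, 'b': 2}
totals.update({"c": 3})  # any Python 3: {'a': 1, 'b': 2, 'c': 3}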
Example #6
def main():

    global fixer
    global fixrunner

    import argparse

    parser = argparse.ArgumentParser(
        description="Generate list of missing forms")
    parser.add_argument("wordlist", help="wordlist")
    parser.add_argument("--allforms", required=True, help="all_forms file")
    parser.add_argument("--allpages", required=True, help="wiki.allpages")
    parser.add_argument(
        "--articles",
        required=True,
        help="Language extract with raw articles, used for checking autofixes")
    parser.add_argument("--save", help="wiktionary commit message")
    parser.add_argument("--limit",
                        type=int,
                        help="Limit processing to first N articles")
    parser.add_argument("--progress",
                        help="Display progress",
                        action='store_true')
    args = parser.parse_args()

    global ARTICLE_FILE
    ARTICLE_FILE = args.articles

    wordlist = Wordlist.from_file(args.wordlist)

    allforms = AllForms.from_file(args.allforms)
    fixer = FormFixer(wordlist)
    fixrunner = FixRunner("es", wordlist, allforms)

    with open(args.allpages) as infile:
        # Loading the entire contents of allpages takes 600M.
        # To conserve memory, temporarily load allforms into a set
        # and keep only the entries in allpages that are also in allforms.

        allforms_set = set(allforms.all_forms)
        allpages = {x.strip() for x in infile if x.strip() in allforms_set}
        del allforms_set

#    form = "achaparrándolo"
#    declared_forms = fixer.get_declared_forms(form, wordlist, allforms)
#    existing_forms = get_existing_forms(form, wordlist)
#    missing_forms, unexpected_forms = fixer.compare_forms(declared_forms, existing_forms)
#    print("declared", declared_forms)
#    print("existing", existing_forms)
#    print("missing", missing_forms)
#    print("unexpected", unexpected_forms)
#    exit()

    count = 0
    for form in allforms.all_forms:

        # Fix for conversion from <sup>x</sup> -> ^x
        if "^" in form:
            continue

        try:
            declared_forms = fixer.get_declared_forms(form, wordlist, allforms)
        except ValueError as e:
            print("ERROR", e)
            #error("form_errors", form, str(e))
            continue

        if not count % 1000 and args.progress:
            print(count, end='\r', file=sys.stderr)

        if args.limit and count >= args.limit:
            break
        count += 1

        existing_forms = get_existing_forms(form, wordlist)

        missing_forms, unexpected_forms = fixer.compare_forms(
            declared_forms, existing_forms)

        missing_pos = []
        for item in missing_forms:

            if item.form != form:
                raise ValueError(form, item)

            if not FormFixer.can_handle_formtype(item.formtype):
                continue

            # TODO: for now skip multi word verbs
            if item.pos == "v" and " " in item.lemma:
                continue

            if item.pos == "n" and item.formtype == "m":
                error("should_be_lemma", form, item)
                continue

            words = list(wordlist.get_words(form, item.pos))
            if not words:
                matches = list(wordlist.get_words(form))
                if matches:
                    if item.pos in missing_pos:
                        continue
                    ety = {w.etymology for w in matches}
                    level = 4 if len(ety) > 1 else 3
#                    error("missing_pos_multi_ety", form, item)
                    items = [i for i in missing_forms if i.pos == item.pos]

                    if fixer.can_handle(item):
                        pos_text = "\n".join(fixer.full_pos(item.form, level, items))
                    else:
                        pos_text = ""
                    error("missing_pos", form, item, pos_text)
                    missing_pos.append(item.pos)
                else:
                    if form in allpages:
                        error("missing_entry", form, item)

                continue


#            if pos == "n" and formtype == "pl" and unexpected_forms:
#                masculines = get_masculines_from_fpl(words[0])
#                masculine_links = [m for m in masculines if (pos, "fpl", m) in unexpected_forms]
#                if masculine_links:
#                    for m in masculine_links:
#                        unexpected_forms.remove((pos, "fpl", m))
#                    print(f"{form}:{pos} links to masculine {masculine_links} instead of feminine $is_doublet")
#                    continue

            error("missing_sense", form, item)

        for item in sorted(unexpected_forms):
            words = list(wordlist.get_words(item.lemma, item.pos))
            if words:
                error("unexpected_form", form, item)
            else:
                error("missing_lemma", form, item)

    if args.save:
        base_url = "User:JeffDoozan/lists/es/forms"
        logger.save(base_url, WikiSaver, commit_message=args.save)
    else:
        logger.save("forms", FileSaver)