Example #1
import argparse
import csv
from sys import exit, stderr

# Helper functions used below (init_wordmap, parse_defaults_from_tsv,
# parse_extras_from_tsv, stub_all_new_para, guess_bound_morphs,
# get_wordmap_fieldnames) are assumed to come from the surrounding
# omorfi scripts package.


def main():
    # initialise argument parser
    ap = argparse.ArgumentParser(description="Guess more data for Finnish TSV databases")
    ap.add_argument(
        "--quiet",
        "-q",
        action="store_false",
        dest="verbose",
        default=False,
        help="do not print output to stdout while processing",
    )
    ap.add_argument(
        "--verbose", "-v", action="store_true", default=False, help="print each step to stdout while processing"
    )
    ap.add_argument("--input", "-i", action="store", required=True, metavar="IFILE", help="read data from IFILE")
    ap.add_argument("--version", "-V", action="version")
    ap.add_argument("--output", "-o", action="store", required=True, metavar="OFILE", help="write data to OFILE")
    ap.add_argument("--fields", "-f", action="store", default=2, metavar="N", help="read N fields from master")
    ap.add_argument("--join", "-j", action="store", required=True, metavar="JFILE", help="read join fields from JFILE")
    ap.add_argument(
        "--stub", "-c", action="store", required=True, metavar="SFILE", help="read stub expressions from SFILE"
    )
    ap.add_argument("--separator", "-s", action="store", default="\t", metavar="SEP", help="use SEP as separator")
    ap.add_argument(
        "--comment",
        "-C",
        action="append",
        default=["#"],
        metavar="COMMENT",
        help="skip lines starting with COMMENT that" "do not have SEPs",
    )
    ap.add_argument("--strip", "-S", action="store", metavar="STRIP", help="strip STRIP characters")
    args = ap.parse_args()

    if args.strip == '"' or args.strip == "'":
        quoting = csv.QUOTE_ALL
        quotechar = args.strip
    else:
        quoting = csv.QUOTE_NONE
        quotechar = None

    errors = False
    joinmap = dict()
    # read joins from file if any
    with open(args.join, "r", newline="") as joins:
        join_reader = csv.DictReader(joins, delimiter=args.separator, quoting=quoting, escapechar="\\", strict=True)
        for join_parts in join_reader:
            if len(join_parts) < 3:
                print("Must have at leas N separators in joins; skipping", join_parts)
                continue
            key = join_parts["new_para"]
            joinmap[key] = join_parts
    stubmap = dict()
    with open(args.stub, "r", newline="") as stubs:
        stub_reader = csv.DictReader(stubs, delimiter=args.separator, quoting=quoting, escapechar="\\", strict=True)
        for stub_parts in stub_reader:
            if len(stub_parts) < 2:
                print("Must have at least N separators in stubbings; skipping", stub_parts)
                continue
            key = stub_parts["new_para"]
            stubmap[key] = stub_parts["deletion"]

    # read from csv files
    with open(args.output, "w", newline="") as output:
        tsv_writer = csv.DictWriter(
            output,
            fieldnames=get_wordmap_fieldnames(),
            delimiter=args.separator,
            quoting=quoting,
            escapechar="%",
            quotechar=quotechar,
            strict=True,
        )
        tsv_writer.writeheader()
        with open(args.input, "r", newline="") as infile:
            tsv_reader = csv.reader(infile, delimiter=args.separator, quoting=quoting, escapechar="\\", strict=True)
            linecount = 0
            for tsv_parts in tsv_reader:
                linecount += 1
                if args.verbose and (linecount % 10000 == 0):
                    print(linecount, "...", sep="", end="\r")
                if len(tsv_parts) < args.fields:
                    print(
                        "Must have at least N fields on each",
                        "non-comment non-empty line; skipping:",
                        tsv_parts,
                        file=stderr,
                    )
                    continue
                # here starts the guesswork: the aim is to fill the wordmap
                # dict with the data needed to generate a lexc line
                wordmap = init_wordmap()
                wordmap = parse_defaults_from_tsv(wordmap, tsv_parts)
                wordmap = parse_extras_from_tsv(wordmap, tsv_parts)
                # Extend from known new paras
                joinkey = wordmap["new_para"]
                if joinkey in joinmap:
                    for k, v in joinmap[joinkey].items():
                        if k != "new_para":
                            if v == "False":
                                wordmap[k] = False
                            elif v == "None":
                                wordmap[k] = None
                            elif k == "kotus_tn":
                                try:
                                    wordmap[k] = int(v)
                                except ValueError:
                                    print("FAIL", k, v, tsv_parts)
                                    exit(2)
                            else:
                                wordmap[k] = v
                else:
                    print(
                        "\033[93mMissing!\033[0m",
                        "new para not in join data:",
                        joinkey,
                        "\n\033[92mExplanation:\033[0m" "add paradigm to morphophonology.tsv and carry on",
                    )
                    errors = True
                    continue

                # Guesswork steps, in order
                # wordmap = guess_stem_features_ktn(wordmap)
                # wordmap = guess_pronunciation(wordmap)
                # wordmap = guess_grade_dir_from_ktn(wordmap)
                # wordmap = guess_harmony(wordmap)
                # wordmap = guess_new_class(wordmap)
                wordmap = stub_all_new_para(wordmap, stubmap)
                if not wordmap:
                    errors = True
                    continue
                # suffixes can be identified by a leading "-"; they need
                # their own lexicon
                wordmap = guess_bound_morphs(wordmap)
                if wordmap["is_suffix"]:
                    wordmap["real_pos"] = wordmap["pos"]
                    wordmap["pos"] = "SUFFIX"
                if "PCLE_HAH" == wordmap["new_para"]:
                    wordmap["real_pos"] = wordmap["pos"]
                    wordmap["pos"] = "INTERJECTION"
                tsv_writer.writerow(wordmap)
    if errors:
        print("you must fix database integrity or hack the scripts", "before continuing")
        exit(1)

    exit()
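
Both the reader and the writer above use the same csv-module pattern: tab-separated values, quoting disabled, and an explicit escape character. Below is a minimal, self-contained sketch of that round trip; the file name, field names, and row values are illustrative and not part of the original script.

import csv

# Illustrative field names; the real script takes them from
# get_wordmap_fieldnames().
fieldnames = ["lemma", "new_para"]

# Write one row as unquoted, backslash-escaped TSV.
with open("demo.tsv", "w", newline="") as out:
    writer = csv.DictWriter(
        out,
        fieldnames=fieldnames,
        delimiter="\t",
        quoting=csv.QUOTE_NONE,
        escapechar="\\",
        strict=True,
    )
    writer.writeheader()
    writer.writerow({"lemma": "talo", "new_para": "N_TALO"})

# Read it back; csv.DictReader keys each row by the header line.
with open("demo.tsv", "r", newline="") as infile:
    reader = csv.DictReader(
        infile,
        delimiter="\t",
        quoting=csv.QUOTE_NONE,
        escapechar="\\",
        strict=True,
    )
    for row in reader:
        print(row["lemma"], row["new_para"])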
Example #2
import argparse
from collections import defaultdict
from sys import exit, stderr, stdin, stdout

# Helper functions used below (init_wordmap, expand_pos,
# guess_stem_features_ktn, guess_pronunciation, guess_grade_dir_from_ktn,
# guess_harmony, guess_new_class) are assumed to come from the
# surrounding omorfi scripts package.


def main():
    ap = argparse.ArgumentParser(description="Converts Omorfi's lexical data from old kotus-csv format to newpara-tsv "
                                 "with possible attribute fields")

    ap.add_argument("--input", "-i", metavar="INFILE",
                    help="read data from INFILE")
    ap.add_argument('--output', '-o', metavar="OUTFILE",
                    help="write converted stuff to OUTFILE")
    ap.add_argument('--plt-file', '-p', metavar="PLTFILE",
                    help="read plurale tantum info (csv) from PLTFILE")
    ap.add_argument('--verbose', '-v', action="store_true",
                    help="Print verbosely while processing")
    ap.add_argument("--version", "-V", action="version")
    args = ap.parse_args()

    plt_info = defaultdict(lambda: False)
    if args.plt_file:
        if args.verbose:
            print("Reading plurale tantum data from",
                  args.plt_file, file=stderr)
        with open(args.plt_file, 'r', newline='') as plt_in:
            headers_skipped = False
            for line in plt_in:
                if headers_skipped:
                    lex, plt = line.rsplit(',', 1)
                    plt_info[lex] = plt.strip('"')
                elif line.find('HEADERS') >= 0:
                    headers_skipped = True

    if args.input:
        infile = open(args.input, 'r', newline='')
    else:
        infile = stdin
    if args.output:
        outfile = open(args.output, 'w', newline='')
    else:
        outfile = stdout

    for line in infile:
        if line.startswith('#') or line.find('<-HEADERS') >= 0:
            continue
        fields = line.strip('"\n').split('","')
        if len(fields) < 4:
            if args.verbose:
                print("Skipping too short line:", line, file=stderr)
            continue
        wordmap = init_wordmap()
        wordmap['stub'] = wordmap['lemma'] = fields[0]
        if args.verbose:
            print(wordmap['lemma'])
        wordmap['kotus_tn'] = int(fields[1])
        wordmap['kotus_av'] = fields[2]
        if wordmap['kotus_av'] == '0':
            wordmap['kotus_av'] = False
        wordmap['pos'] = fields[3]
        wordmap = expand_pos(wordmap)
        if plt_info:
            wordmap['plurale_tantum'] = plt_info[
                '"' + '","'.join(fields[0:4]) + '"']
        for i in range(4, len(fields)):
            if fields[i].startswith('plt='):
                wordmap['plurale_tantum'] = fields[i]
            elif fields[i].startswith('boundaries='):
                fields[i] = fields[i].replace('|', '{WB}')
                wordmap['stub'] = wordmap['boundaries'] = fields[i]
        wordmap = guess_stem_features_ktn(wordmap)
        wordmap = guess_pronunciation(wordmap)
        wordmap = guess_grade_dir_from_ktn(wordmap)
        wordmap = guess_harmony(wordmap)
        wordmap = guess_new_class(wordmap)

        wordmap['extras'] = '\t'.join(fields[4:])
        if wordmap['extras']:
            wordmap['extras'] = '\t' + wordmap['extras']

        if args.verbose:
            print("Guessed new para: %(new_para)r" % (wordmap))
        print("%(lemma)s\t%(new_para)r%(extras)s" % (wordmap), file=output)

    if args.input:
        infile.close()
    if args.output:
        outfile.close()
    exit()
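
The converter's core is the field split: each kotus-csv record is a line of fully quoted fields joined by commas, so stripping the outer quotes (and the newline) and splitting on the quoted-comma sequence recovers the fields without a csv reader. A small sketch with a made-up record; the lemma, inflection class, gradation, and pos values are illustrative.

# A hypothetical kotus-csv record: fully quoted fields joined by commas.
line = '"talo","1","0","N"\n'

# Strip the outer quotes and the newline, then split on the '","' sequence.
fields = line.strip('"\n').split('","')
print(fields)  # ['talo', '1', '0', 'N']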