コード例 #1
0
ファイル: table2words.py プロジェクト: koskenni/twol
def step1():
    """Convert a tabular CSV paradigm into a one-example-per-row CSV file.

    Reads the paradigm table named by the positional ``input`` argument
    and writes one (MORPHEMES, MORPHS) row per example word to the
    ``output`` file.  Command-line arguments are taken from sys.argv.
    """
    import csv, re
    import twol.cfg as cfg

    version = cfg.timestamp(__file__)

    import argparse
    argparser = argparse.ArgumentParser(
        "python3 paratab2segcsv.py",
        description="Converts a tabular csv paradigm into"\
        " one example per row CSV file. Version {} ".format(version))
    argparser.add_argument(
        "input",
        default="ksk-paradigms.csv",
        help="Paradigm table as a CSV file")
    argparser.add_argument(
        "output",
        default="ksk-seg-examp.csv",
        help="One example per row paradigm as a CSV file")
    # NOTE(review): the -s and -z options are parsed but not used in this
    # step; they are kept for command-line compatibility.
    argparser.add_argument(
        "-s", "--morph-separator",
        default=".",
        help="Boundary between the morphs in a table cell")
    argparser.add_argument(
        "-d", "--csv-delimiter",
        default=",",
        help="CSV delimiter between the two fields, default is ','")
    argparser.add_argument(
        "-n", "--name-separator",
        default=".",
        help="Separator between morpheme names"\
        " in the morpheme list, default is '.'")
    argparser.add_argument(
        "-z", "--zero-symbol",
        default="Ø",
        help="Symbol to be inserted in word forms in order to"\
        " align them, default is Ø.  You are discouraged to change it.")
    args = argparser.parse_args()

    # Context managers guarantee both files are closed even on errors
    # (the output file was previously closed manually, the unused
    # morph_set/seg_ex_list variables and the shared mutated dict are gone).
    with open(args.output, "w") as out_file, \
         open(args.input, "r") as csvfile:
        writer = csv.DictWriter(out_file,
                                ["MORPHEMES", "MORPHS"],
                                delimiter=args.csv_delimiter)
        writer.writeheader()
        reader = csv.DictReader(csvfile,
                                delimiter=args.csv_delimiter,
                                skipinitialspace=True)
        for row in reader:
            if row["ID"].startswith("?"):
                continue            # row marked as questionable, skip it
            # process each cell of the row
            for column_label, words in row.items():
                # skip empty cells, bookkeeping columns and non-stem columns
                if (not words) or (column_label in {"ID", "KSK"}) \
                   or ("STM" not in column_label):
                    continue
                morpheme_list = column_label.split(args.name_separator)
                if morpheme_list[0] == 'STM':
                    # replace the generic stem label with the lexeme ID
                    morpheme_list[0] = row['ID']
                words_clean = re.sub(r'[][()]', '', words)
                word_list = re.split(r"\s+", words_clean)
                for morphs in word_list:
                    # skip empty entries and words marked with '*'
                    if not morphs or morphs.find('*') >= 0:
                        continue
                    # write a fresh dict per row instead of mutating a
                    # shared one
                    writer.writerow(
                        {"MORPHEMES":
                         args.name_separator.join(morpheme_list).strip(),
                         "MORPHS": morphs})
    return
コード例 #2
0
def main():
    """Rename raw morphophonemes and write pair symbol strings.

    Reads (1) a renaming table mapping raw morphophoneme names to neat
    ones and (2) a CSV file of aligned examples, and writes each example
    as a space-separated string of pair symbols, optionally augmented
    with affix morpheme names (-F).
    """
    import csv
    import argparse
    import twol.cfg as cfg
    version = cfg.timestamp(__file__)

    argparser = argparse.ArgumentParser(
        "python3 raw2named.py",
        description="Renames raw morphophonemes. Version {} ".format(version))
    argparser.add_argument(
        "input",
        default="demo-raw.csv",
        help="aligned examples as a CSV file")
    argparser.add_argument(
        "output",
        default="demo-renamed.pstr",
        help="renamed examples as a space separated pair symbol strings")
    argparser.add_argument(
        "names",
        default="demo-renaming.csv",
        help="mapping from raw to neat morphophonemes as a CSV file")
    argparser.add_argument(
        "-d", "--delimiter",
        default=",",
        help="delimiter between raw name and new name fields, default is ','")
    argparser.add_argument(
        "-n", "--name-separator",
        default=".",
        help="Separator between morpheme names in the morpheme list,"\
        " default is '.'")
    argparser.add_argument(
        "-F", "--add-features",
        default=False, action="store_true",
        help="add affix morpheme names to the pairstring representation")
    argparser.add_argument(
        "-v", "--verbosity",
        default=0,
        type=int,
        help="level of diagnostic and debugging output")
    args = argparser.parse_args()

    cfg.verbosity = args.verbosity

    mphon_name = {}

    # Read in the name file, a CSV file which contains three fields:
    # 1. the raw (old) name for the morphophoneme
    # 2. a neat (new) name for the morphophoneme
    # 3. comments documenting typical occurrences of the morphophoneme
    with open(args.names) as namefile:
        reader = csv.reader(namefile,
                            delimiter=args.delimiter,
                            skipinitialspace=True)
        for row in reader:
            if not row or (not row[0].strip()):
                continue
            if len(row) < 2:
                print("*** TOO FEW FIELDS IN:", row)
                continue
            if row[1].strip():
                mphon_name[row[0].strip()] = row[1].strip()

    # Context managers close both files (the output file was previously
    # left open until interpreter exit).
    with open(args.output, "w") as outfil, open(args.input) as csvfile:
        reader = csv.DictReader(csvfile,
                                delimiter=args.delimiter,
                                skipinitialspace=True)
        for row in reader:
            # the zero-filled form supplies one output symbol per raw
            # morphophoneme
            zero_filled_str = row["ZEROFILLED"].strip().replace(".", "")
            raw_str = row["RAW"].strip()
            raw_lst = raw_str.split(" ")
            pairsym_lst = []
            if cfg.verbosity >= 20:
                print(row)
                print("raw_lst:", raw_lst)
            if len(raw_lst) != len(zero_filled_str):
                print("** LENGTHS DISAGREE **", raw_lst, zero_filled_str)
                continue

            for raw_insym, outsym in zip(raw_lst, zero_filled_str):
                if raw_insym == outsym:
                    psym = raw_insym       # identity pair, keep as is
                else:
                    # rename if a neat name exists, else keep the raw name
                    clean_insym = mphon_name.get(raw_insym, raw_insym)
                    psym = clean_insym + ":" + outsym
                pairsym_lst.append(psym)
            if args.add_features:
                # append the affix morpheme names as NAME:Ø pairs
                morpheme_lst = row["MORPHEMES"].strip().split(args.name_separator)
                for morpheme in morpheme_lst[1:]:
                    pairsym_lst.append(morpheme + ":Ø")
            pairsym_str = " ".join(pairsym_lst)
            print(pairsym_str, file=outfil)

    return
コード例 #3
0
ファイル: zerofilled2raw.py プロジェクト: koskenni/twol
def main():
    """Form raw morphophonemes out of zero-filled morphs.

    Combines the corresponding symbols of the principal forms of each
    stem into raw morphophonemes and writes the input data augmented
    with a RAW column that holds a space-separated pair symbol string
    suitable for twol-comp or twol-discov.
    """

    import twol.cfg as cfg
    version = cfg.timestamp(__file__)

    import argparse
    argparser = argparse.ArgumentParser(
        "python3 zerofilled2raw.py",
        description="Forms raw morphophonemes out of zero-filled"\
        " morphs and produces a space-separated pair string"\
        " representation for the word suitable for"\
        " twol-comp or twol-discov. Version {}".format(version))
    argparser.add_argument("input",
                           help="zero-filled example words as a CSV file")
    argparser.add_argument(
        "output",
        help="The output file in CSV format with a new column"\
        " where the words are represented with raw"\
        " morhpophonemes from zero-filling.")
    argparser.add_argument(
        "affix_info",
        help="Principal forms and morphophonemic affixes as a CSV file")
    argparser.add_argument("-d",
                           "--csv-delimiter",
                           default=",",
                           help="Delimiter between the fields, default=','")
    argparser.add_argument(
        "-s",
        "--morph-separator",
        default=".",
        help="Separator between morphs in the word form, default='.'")
    argparser.add_argument(
        "-n",
        "--name-separator",
        default=".",
        help="Separator between morpheme names in the morpheme list")
    argparser.add_argument("-z",
                           "--zero-symbol",
                           default="Ø",
                           help="Symbol inserted in word forms to align them")
    argparser.add_argument("-v",
                           "--verbosity",
                           default=0,
                           type=int,
                           help="level of diagnostic and debugging output")
    args = argparser.parse_args()

    import re
    import csv
    import collections

    # Principal forms (principal parts): the forms which uniquely
    # determine the morphophonemic variations that may occur within
    # the stem.
    principal_lst = []
    # Maps an affix feature name to its morphophonemic representation.
    feat2mphons = {}

    # Read in the feature combinations of principal forms and
    # the morphophonemic representations of affix features.
    with open(args.affix_info, "r") as afffil:
        affrdr = csv.reader(afffil,
                            delimiter=args.csv_delimiter,
                            skipinitialspace=True)
        for row in affrdr:
            if row[1] == '+':
                # a '+' in the second field marks a principal form
                feat = row[0]
                if feat not in principal_lst:
                    principal_lst.append(feat)
            else:
                feat2mphons[row[0]] = row[1]
    if args.verbosity >= 10:
        print("principal_lst =", principal_lst)
        print("feat2mphons =", feat2mphons)

    # Read in the morpheme names and the zero-filled morphs.
    # stem_morpheme_data is indexed by stem morpheme name; each value is
    # a list of (form_name, orig_morphs, zerof_morph_lst) tuples holding
    # the original data for that stem morpheme.
    stem_morpheme_data = collections.OrderedDict()
    with open(args.input, "r") as infil:
        rdr = csv.DictReader(infil,
                             delimiter=args.csv_delimiter,
                             skipinitialspace=True)
        for row in rdr:
            names = row["MORPHEMES"].strip()
            orig_morphs = row["MORPHS"].strip()
            zerof_morphs = row["ZEROFILLED"].strip()
            if (not names) or (not zerof_morphs):
                continue
            name_lst = names.split(args.name_separator, maxsplit=1)
            stem_name = name_lst[0]
            form_name = ".".join(name_lst[1:]) if len(name_lst) > 1 else ""
            zerof_morph_lst = zerof_morphs.split(args.morph_separator,
                                                 maxsplit=1)
            if stem_name not in stem_morpheme_data:
                stem_morpheme_data[stem_name] = []
            stem_morpheme_data[stem_name].append(
                (form_name, orig_morphs, zerof_morph_lst))

    # A context manager guarantees the output file is closed (it was
    # previously left open until interpreter exit).
    with open(args.output, "w") as ofil:
        writer = csv.DictWriter(
            ofil, fieldnames=["MORPHEMES", "MORPHS", "ZEROFILLED", "RAW"])
        writer.writeheader()

        for stem_morpheme, data_lst in stem_morpheme_data.items():
            princ_zstem_lst = []
            if args.verbosity >= 10:
                print("*** stem_morpheme, data_lst:", stem_morpheme, data_lst)
            # select the principal forms of this stem morpheme
            for data in data_lst:
                form_name, orig_morphs, zerof_morph_lst = data
                if form_name in principal_lst:
                    princ_zstem_lst.append(zerof_morph_lst[0])
            # form the raw morphophonemes by combining corresponding
            # symbols of the (equal-length) principal zero-filled stems
            if args.verbosity >= 10:
                print("*** princ_zstem_lst:", princ_zstem_lst)
            # NOTE(review): raises IndexError if no principal form was
            # found for the stem -- confirm inputs always contain one.
            lgth = len(princ_zstem_lst[0])
            zstem_rawsym_lst = []
            for i in range(lgth):
                raw_seq = "".join(princ_zstem[i]
                                  for princ_zstem in princ_zstem_lst)
                if re.match(r"^(.)(\1)*$", raw_seq):
                    raw_sym = raw_seq[0]  # abbreviate if all identical
                else:
                    raw_sym = "{" + raw_seq + "}"
                zstem_rawsym_lst.append(raw_sym)
            zstem_pairsym_str = " ".join(zstem_rawsym_lst)
            # Output the data augmented with the raw morphophoneme
            # representation.  A fresh dict is built for every output
            # row; the original code reused a variable leaked from the
            # input-reading loop, which raises NameError on empty input
            # and silently depends on the input columns exactly matching
            # the output fieldnames.
            for data in data_lst:
                form_name, orig_morphs, zerof_morph_lst = data
                form_part = args.name_separator + form_name if form_name else ""
                raw_lst = [zstem_pairsym_str]
                feat_lst = form_name.split(args.name_separator)
                for feat in feat_lst:
                    raw_lst.append(feat2mphons[feat])
                writer.writerow(
                    {"MORPHEMES": (stem_morpheme + form_part).strip(),
                     "MORPHS": orig_morphs,
                     "ZEROFILLED":
                     args.morph_separator.join(zerof_morph_lst),
                     "RAW": " ".join(raw_lst)})
    return
コード例 #4
0
ファイル: discover.py プロジェクト: koskenni/twol
def main():
    """Propose two-level rules for the morphophonemes in an example file.

    Reads the example pair strings, collects for each input symbol the
    set of pair symbols it occurs in and, for every pair symbol whose
    input symbol has more than one possible output, prints whichever of
    the "=>" (positive contexts) or "/<=" (negative contexts) rule has
    the smaller context set.

    NOTE(review): ``cfg``, ``twexamp``, ``pair_symbols_for_input``,
    ``relevant_contexts``, ``minimal_contexts``, ``print_rule`` and
    ``context_to_output_str`` are not defined in this function --
    presumably module-level names; confirm against the full module.
    """

    version = cfg.timestamp(__file__)

    import argparse
    arpar = argparse.ArgumentParser(
        "twol-discov",
        description="Deduces two-level rules out of"\
        " a file of examples.  The file must consist of"\
        " lines of space-separated pair string.  Such a file"\
        " can be produced e.g. by twol-raw2renamed program."\
        " Version {}".format(version))
    arpar.add_argument("examples",
                       help="Example pair strings file",
                       default="test.pstr")
    arpar.add_argument(
        "-s", "--symbol",
        help="Input symbol for which to find rules."\
        " If not given then rules are proposed for"\
        " all morphophonemes in the example file",
        default="")
    arpar.add_argument(
        "-v", "--verbosity",
        help="Level of  diagnostic output, default is 5. Set to"\
        " 0 to omit the printing of relevant examples for the rules",
        type=int, default=5)
    args = arpar.parse_args()

    cfg.verbosity = args.verbosity

    # Read examples without building FSTs (only symbol sets are needed).
    twexamp.read_examples(filename=args.examples, build_fsts=False)
    if cfg.verbosity >= 10:
        print("--- all examples read in ---")

    # Map every input symbol to the set of pair symbols where it occurs.
    for insym in cfg.input_symbol_set:
        pair_symbols_for_input[insym] = set()
    for insym, outsym in cfg.symbol_pair_set:
        pair_symbol = cfg.sympair2pairsym(insym, outsym)
        pair_symbols_for_input[insym].add(pair_symbol)

    if args.symbol:
        # Restrict rule discovery to the requested input symbol.
        if args.symbol in pair_symbols_for_input:
            pair_set = pair_symbols_for_input[args.symbol]
            pair_lst = []
            for pairsym in pair_set:
                insym, outsym = cfg.pairsym2sympair(pairsym)
                pair_lst.append((insym, outsym))
            if cfg.verbosity >= 10:
                print("pair_lst:", pair_lst)
        else:
            # Unknown symbol: list the multi-character input symbols
            # (i.e. the morphophoneme-like ones) as a hint and exit.
            print("Symbol {} not in the input alphabet of examples".format(
                args.symbol))
            lst = [
                insym for insym in pair_symbols_for_input.keys()
                if len(insym) > 2
            ]
            print("The following symbols are:", " ".join(sorted(lst)))
            exit("")
    else:
        pair_lst = sorted(cfg.symbol_pair_set)

    for insym, outsym in pair_lst:
        # Skip input symbols with a unique output: no rule is needed.
        if len(pair_symbols_for_input[insym]) <= 1:
            continue
        pair_symbol = cfg.sympair2pairsym(insym, outsym)
        posi_contexts, nega_contexts = relevant_contexts(pair_symbol)
        pos_contexts, neg_contexts = minimal_contexts(pair_symbol,
                                                      posi_contexts.copy(),
                                                      nega_contexts.copy())
        # Prefer whichever rule type has fewer contexts to state.
        if len(pos_contexts) <= len(neg_contexts) or cfg.verbosity > 0:
            print_rule(pair_symbol, "=>", pos_contexts)
        else:
            print_rule(pair_symbol, "/<=", neg_contexts)
        if args.verbosity >= 5:
            # Print the relevant examples as comments after the rule.
            for lc, rc in posi_contexts:
                l_str = context_to_output_str(lc)
                r_str = context_to_output_str(rc)
                print("!{:>29}<{}>{}".format(l_str, outsym, r_str))
コード例 #5
0
ファイル: twolcomp.py プロジェクト: koskenni/twol
def main():
    """Compile two-level rules and test them against example words.

    Reads the examples (either one precompiled FST or pair-string
    files) and the rule files, compiles every rule into an FST and,
    depending on --thorough, tests each rule against the positive and
    negative examples.  Optionally writes the compiled rules (-o), the
    positive examples rejected by the whole rule set (-l) and the
    wrongly accepted strings (-w) as FSTs.

    NOTE(review): ``cfg``, ``sys``, ``fileinput``, ``hfst``,
    ``twexamp``, ``twbt``, ``twrule``, ``twparser_init``,
    ``parse_rule`` and ``print_raw_paths`` are not defined in this
    function -- presumably imported at module level; confirm against
    the full module.
    """

    version = cfg.timestamp(__file__)
    import argparse
    arpar = argparse.ArgumentParser(
        description="A compiler and tester for two-level rules."\
        " Version {}."\
        " See https://pytwolc.readthedocs.io/en/latest/index.html"\
        " or https://github.com/koskenni/twol"\
        " for more information.".format(version))
    arpar.add_argument(
        "-e", "--examples", action='store', nargs='+',
        help="""Either one name of a FST file that contains the examples or
            a list of names of files which contain the PSTR form examples
            used for compiling the rules.""",
        default=[None])
    arpar.add_argument(
        "-r", "--rules", action='store', nargs='+',
        help="""One or more files which contain the rules,
             either just one rule file or a file of defines
             as the first one and a part of the whole rule set
             as the second""",
        default=[None])
    arpar.add_argument(
        "-o", "--output",
        help="File to which write the compiled rules if a name is given",
        default="")
    arpar.add_argument(
        "-l", "--lost",
        help="File to which write the examples"\
        " that were not accepted by all rules"\
        " -- it is written as a FST",
        default="")
    arpar.add_argument(
        "-w", "--wrong",
        help="file to which write the wrong strings"\
        " that are accepted by all rules -- it is written as a FST",
        default="")
    arpar.add_argument(
        "-t", "--thorough",
        help="test each rule separately: 0 if no testing is desired,"\
        " 1 if against positive examples,"
        " 2 against both positive and negative examples."\
        " Default is 2.",
        type=int, choices=[0, 1, 2], default=2)
    arpar.add_argument(
        "--recursion",
        help="set the limit for recursion depth",
        type=int)
    arpar.add_argument(
        "-v", "--verbosity",
        help="level of  diagnostic output",
        type=int, default=0)

    args = arpar.parse_args()

    cfg.verbosity = args.verbosity
    if args.recursion:
        sys.setrecursionlimit(args.recursion)

    # Examples: either one binary FST file or pair-string text files.
    if len(args.examples) == 1 and args.examples[0].endswith(".fst"):
        twexamp.read_fst(args.examples[0])
    else:
        twexamp.read_examples(args.examples)

    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.examples_fst, title="examples_fst")

    parser = twparser_init()

    # Encoded FSA view of the example FST (pair symbols joined with "^").
    examples_fsa = hfst.fst_to_fsa(cfg.examples_fst, separator="^")

    # Input-side projection of the examples.
    examples_up_fsa = cfg.examples_fst.copy()
    examples_up_fsa.input_project()
    if cfg.verbosity >= 30:
        twbt.ppfst(examples_up_fsa, title="examples_up_fsa")

    twrule.init()

    i = 0                    # physical line number over all rule files
    skip = False             # inside a STOP ... START region?
    all_rules_fst_lst = []   # compiled rule FSTs (kept for -o/-l/-w)
    line_lst = []            # logical lines of the rule being collected

    for line_nl in fileinput.input(args.rules):
        i += 1
        if not line_lst:
            # starting a new rule: reset the raw-line buffer too
            line_nl_lst = []
        line_nl_lst.append(line_nl)
        # strip the comment part (after '!') and surrounding whitespace
        line = line_nl.split('!', maxsplit=1)[0].strip()
        if line == "START":
            skip = False
            continue
        elif line == "STOP":
            skip = True
        if skip or (not line) or line.startswith("!"):
            continue
        line_lst.append(line)
        # a rule may span several lines; it ends at ';'
        if not line.endswith(";"):
            continue
        else:
            rule_str = " ".join(line_lst)
            line_lst = []

        op, left, right = parse_rule(parser, rule_str, i, line_nl_lst)
        if op == "?" or not (left and right):
            # parse failure or empty parts: already reported by the parser
            continue

        if (args.thorough > 0 and op != "=") or cfg.verbosity > 0:
            print("\n")
            print(rule_str)

        if op == "=":
            # a define, not a rule proper -- nothing to compile or test
            #        if cfg.verbosity > 0:
            #            print(line)
            if cfg.verbosity >= 10:
                print(left, op)
                twbt.ppfst(right)
            continue
        elif op == "=>":
            R, selector_fst, MIXe = twrule.rightarrow(line, left, *right)
        elif op == "<=":
            R, selector_fst, MIXe = twrule.output_coercion(line, left, *right)
        elif op == "<--":
            R, selector_fst, MIXe = twrule.input_coercion(line, left, *right)
        elif op == "<=>":
            R, selector_fst, MIXe = twrule.doublearrow(line, left, *right)
        elif op == "/<=":
            R, selector_fst, MIXe = twrule.center_exclusion(line, left, *right)
        else:
            print("Error: not a valid type of a rule", op)
            continue
        if cfg.verbosity >= 10:
            twbt.ppfst(R)
        if args.lost or args.wrong or args.output:
            all_rules_fst_lst.append(R)
        # Test the rule against the positive examples.
        if args.thorough > 0:
            selector_fst.intersect(cfg.examples_fst)
            # selector_fst.n_best(5)
            selector_fst.minimize()
            if cfg.verbosity >= 20:
                paths = selector_fst.extract_paths(output='raw')
                print_raw_paths(paths[0:20])
            passed_pos_examples_fst = selector_fst.copy()
            passed_pos_examples_fst.intersect(R)
            if args.thorough > 0:
                if passed_pos_examples_fst.compare(selector_fst):
                    print("All positive examples accepted")
                else:
                    # report (at most 20) examples the rule rejects
                    lost_examples_fst = selector_fst.copy()
                    lost_examples_fst.minus(passed_pos_examples_fst)
                    lost_examples_fst.minimize()
                    print("** Some positive examples were rejected:")
                    lost_paths = lost_examples_fst.extract_paths(output='raw')
                    print_raw_paths(lost_paths[0:20])
        # Test the rule against generated negative examples.
        if args.thorough > 1 and op in {"=>", "<=", "<=>", "<--"}:
            neg_examples_fsa = examples_fsa.copy()
            neg_examples_fsa.compose(MIXe)
            neg_examples_fsa.output_project()
            neg_examples_fst = hfst.fsa_to_fst(neg_examples_fsa, separator="^")
            # keep only strings that are NOT genuine examples
            neg_examples_fst.minus(cfg.examples_fst)
            NG = examples_up_fsa.copy()
            NG.compose(neg_examples_fst)
            npaths = NG.extract_paths(output='raw')
            #print_raw_paths(npaths)
            passed_neg_examples_fst = NG.copy()
            passed_neg_examples_fst.intersect(R)
            if passed_neg_examples_fst.compare(hfst.empty_fst()):
                print("All negative examples rejected")
            else:
                print("** Some negative examples accepted:")
                npaths = passed_neg_examples_fst.extract_paths(output='raw')
                print_raw_paths(npaths[0:20])

    # Apply the intersection of all rules to the examples (for -l/-w).
    if args.lost or args.wrong:
        RESU = examples_up_fsa.copy()
        print(RESU.number_of_arcs(), "arcs in RESU")
        RESU.compose_intersect(tuple(all_rules_fst_lst))
        RESU.minimize()
    if args.lost:
        # positive examples rejected by the combined rule set
        lost_positive_examples_fst = cfg.examples_fst.copy()
        lost_positive_examples_fst.minus(RESU)
        lost_positive_examples_fst.minimize()
        lost_stream = hfst.HfstOutputStream(filename=args.lost)
        lost_stream.write(lost_positive_examples_fst)
        lost_stream.flush()
        lost_stream.close()
        print("wrote lost examples to", args.lost)
    if args.wrong:
        # strings accepted by all rules that are not genuine examples
        WRONG = RESU.copy()
        WRONG.subtract(cfg.examples_fst)
        WRONG.minimize()
        wrong_stream = hfst.HfstOutputStream(filename=args.wrong)
        wrong_stream.write(WRONG)
        wrong_stream.flush()
        wrong_stream.close()
        print("wrote wrongly accepted examples to", args.wrong)
    if args.output:
        outstream = hfst.HfstOutputStream(filename=args.output)
        for fst in all_rules_fst_lst:
            outstream.write(fst)
        outstream.flush()
        outstream.close()
        print("wrote {} rule transducers to {}".format(len(all_rules_fst_lst),
                                                        args.output))
    return
コード例 #6
0
ファイル: aligner.py プロジェクト: koskenni/twol
def main():
    """Align pairs of cognate words read from standard input.

    Loads a metrics FST (weights for phoneme correspondences), then for
    every input line aligns the two words separated by --delimiter and
    prints the best alignment(s) in the requested layout.
    """

    version = cfg.timestamp(__file__)

    import argparse
    arpar = argparse.ArgumentParser(
        "twol-aligner",
        description="""Aligns pairs of words separated by a
        colon. See https://pytwolc.readthedocs.io/en/latest/alignment.html
        for detailed instructions. Version {}""".format(version))
    arpar.add_argument(
        "metrics",
        help="FST computed with twol-metric from an alphabet file."
        " The FST contains weights for phoneme correspondences.")
    arpar.add_argument("-d",
                       "--delimiter",
                       help="Separates the two cognates, default is ' '",
                       default=" ")
    arpar.add_argument("-l",
                       "--layout",
                       choices=["vertical", "list", "horizontal"],
                       help="output layout",
                       default="vertical")
    arpar.add_argument("-c",
                       "--comment-separator",
                       help="""Comment separator. Comments in input after this
        character are just copied to output. Input words are then
        also copied to the end of comments. Default separator is ''
        i.e. no comments.  Comments come to the output only in
        horizontal layout.""",
                       default="")
    arpar.add_argument("-w",
                       "--weights",
                       help="print also the weight of each alignment."
                       " Default is not to print."
                       " Works only if a comment separator is also set.",
                       action="store_true")
    arpar.add_argument(
        "-n",
        "--number",
        help="number of best results to be printed. Default is 1",
        type=int,
        default=1)
    arpar.add_argument("-v",
                       "--verbosity",
                       help="Level of diagnostic information to be printed. "
                       "Default is 0",
                       type=int,
                       default=0)

    args = arpar.parse_args()
    cfg.verbosity = args.verbosity

    # Load the precompiled metrics FST.
    algfile = hfst.HfstInputStream(args.metrics)
    aligner_fst = algfile.read()

    import sys

    for line in sys.stdin:
        # split off an optional comment tail
        if args.comment_separator:
            pair, comm, comments = \
                line.strip().partition(args.comment_separator)
        else:
            pair, comm, comments = line.strip(), "", ""
        if args.verbosity > 0:
            print(pair, args.comment_separator, comm)
        in_word, sep, out_word = pair.strip().partition(args.delimiter)
        if not out_word:
            # a single word on the line is aligned with itself
            out_word = in_word

        raw_paths = align_two_words(in_word, out_word, aligner_fst, "Ø",
                                    args.number)
        for aligned_result in raw_paths:
            print_result(aligned_result,
                         comments,
                         args.weights,
                         layout=args.layout)

    return
コード例 #7
0
def main():
    """Align lists of cognate words read from standard input.

    Initializes the aligner from an alphabet definition, then for each
    input line splits it into words by --delimiter, computes the best
    multi-alignments and prints them in the requested layout.
    """

    version = cfg.timestamp(__file__)

    import argparse
    arpar = argparse.ArgumentParser(
        "twol-multialign",
        description="""Version {}\nAligns lists of words separated by
        a DELIMITER.  See
        https://pytwolc.readthedocs.io/en/latest/alignment.html for
        detailed instructions. """.format(version))
    arpar.add_argument(
        "alphabet",
        help="An alphabet definition file with features and similarity sets.")
    arpar.add_argument(
        "-d", "--delimiter",
        help="Separates the two cognates, default is ' '",
        default=" ")
    arpar.add_argument(
        "-l", "--layout",
        choices=["vertical","list","horizontal"],
        help="output layout",
        default="vertical")
    arpar.add_argument(
        "-c", "--comment-separator",
        help="""Comment separator. Comments in input after this
        character are just copied to output. Input words are then
        also copied to the end of comments. Default separator is ''
        i.e. no comments.  Comments come to the output only in
        horizontal layout.""",
        default="")
    arpar.add_argument(
        "-w", "--weights",
        help="print also the weight of each alignment."
        " Default is not to print."
        " Works only if a comment separator is also set.",
        action="store_true")
    arpar.add_argument(
        "-x", "--extra-zeros", default=0, type=int,
        help="number of extra zeros to be tried in alignment")
    arpar.add_argument(
        "-n", "--number",
        help="number of best results to be printed. Default is 1",
        type=int, default=1)
    arpar.add_argument(
        "-v", "--verbosity",
        help="Level of diagnostic information to be printed. "
        "Default is 0",
        type=int, default=0)

    args = arpar.parse_args()

    if args.verbosity:
        cfg.verbosity = args.verbosity

    # Build the weighted alignment machinery from the alphabet file.
    init(args.alphabet, all_zero_weight=1000)

    import sys
    for line in sys.stdin:
        # split off an optional comment tail
        if args.comment_separator:
            word_str, comm, comments = \
                line.strip().partition(args.comment_separator)
        else:
            word_str, comm, comments = line.strip(), "", ""
        if args.verbosity > 0:
            print(word_str, args.comment_separator, comm)
        word_lst = word_str.strip().split(args.delimiter)

        aligned_results_lst = multialign(word_lst,
                                         zero="Ø",
                                         max_zeros=args.extra_zeros,
                                         best_count=args.number)
        if cfg.verbosity >= 10:
            print("aligned_results_lst:", aligned_results_lst)
        for aligned_result in aligned_results_lst:
            print_result(aligned_result,
                         comments,
                         args.weights,
                         layout=args.layout)
    return
コード例 #8
0
ファイル: words2zerofilled.py プロジェクト: koskenni/twol
def main():
    """Align segmented example words and write zero-filled forms as CSV.

    Reads a CSV file with MORPHEMES and MORPHS columns, collects the
    allomorphs of each morpheme, aligns them with twol.multialign, and
    writes a CSV file with MORPHEMES, MORPHS and ZEROFILLED columns
    where each morph is padded with zero symbols so that all allomorphs
    of a morpheme have the same length.
    """
    import twol.cfg as cfg
    version = cfg.timestamp(__file__)

    import argparse
    argparser = argparse.ArgumentParser(
        "python3 words2zerofilled.py",
        description="Aligns a set of word forms with morph boundaries"
        " Version {} ".format(version))
    argparser.add_argument(
        "input",
        default="ksk-seg-examp.csv",
        help="morpheme names and segmented example words as a CSV file")
    argparser.add_argument(
        "output",
        default="ksk-alig-examp.csv",
        help="example words plus zero-filled aligned forms as a CSV file")
    argparser.add_argument(
        "alphabet",
        default="alphabet-test.text",
        help="An alphabet definition which determines"
        " the weights for morphophonemes")
    argparser.add_argument(
        "-s", "--morph-separator",
        default=".",
        help="Separator between morphs in the word form, default is '.'")
    argparser.add_argument(
        "-d", "--csv-delimiter",
        default=",",
        help="Delimiter between the fields")
    argparser.add_argument(
        "-n", "--name-separator",
        default=".",
        help="separator between morpheme names"
        " in the morpheme list, default is '.'")
    argparser.add_argument(
        "-z", "--zero-symbol",
        default="Ø",
        help="symbol to be inserted in word forms to align them")
    argparser.add_argument(
        "-x", "--extra-zeros",
        default=0,
        type=int,
        help="number of extra zeros to be tried in alignment")
    argparser.add_argument(
        "-v", "--verbosity",
        default=0,
        type=int,
        help="level of diagnostic and debugging output")
    args = argparser.parse_args()

    import csv
    import collections

    cfg.verbosity = args.verbosity

    # STEP 1:
    # Read in the segmented words and collect the allomorphs of each morpheme

    # morphs_of_morpheme[morpheme_name] == ordered list of its unique allomorphs
    morphs_of_morpheme = {}
    # Each example word is represented as a list of (morpheme, morph) pairs.
    seg_example_list = []
    # Names of stem morphemes, i.e. the first morpheme of each example word.
    stem_name_set = set()

    with open(args.input) as csvfile:
        reader = csv.DictReader(csvfile,
                                delimiter=args.csv_delimiter,
                                skipinitialspace=True)
        for line_no, row in enumerate(reader, start=1):
            morpheme_list = row["MORPHEMES"].strip().split(args.name_separator)
            morph_list = row["MORPHS"].strip().split(args.morph_separator)
            if args.verbosity >= 25:
                print(row["MORPHEMES"])
                print(morpheme_list)
                print(row["MORPHS"])
                print(morph_list)
            # Each morpheme name must pair with exactly one morph.
            if len(morpheme_list) != len(morph_list):
                print("** line", line_no, ":", row["MORPHEMES"],
                      "is incompatible with", row["MORPHS"])
                continue
            if not morpheme_list:
                continue
            stem_name_set.add(morpheme_list[0])
            name_morph_pair_lst = list(zip(morpheme_list, morph_list))
            if args.verbosity >= 10:
                print("name_morph_pair_lst", name_morph_pair_lst)
            seg_example_list.append(name_morph_pair_lst)
            for morpheme, morph in name_morph_pair_lst:
                if args.verbosity >= 10:
                    print("morpheme, morph:", morpheme, morph)
                morph = morph.strip()
                # Keep allomorphs unique but in first-seen order.
                morphs = morphs_of_morpheme.setdefault(morpheme, [])
                if morph not in morphs:
                    morphs.append(morph)
    if args.verbosity >= 5:
        print("morphs_of_morpheme", morphs_of_morpheme)

    print("-- STEP 1 COMPLETED (seg_example_list, stem_name_set,"
          " morphs_of_morpheme done)--")

    # STEP 2:
    # align the allomorphs of each morpheme

    import twol.multialign as multialign

    multialign.init(args.alphabet, all_zero_weight=1)

    # alignments[morpheme_name] == sequence of aligned symbols.  Each
    # aligned symbol has as many characters as there are allomorphs.
    alignments = {}

    for morpheme in sorted(morphs_of_morpheme.keys()):
        morphs = morphs_of_morpheme[morpheme]
        # Bug fix: initialize on every iteration so the debug print below
        # never sees an undefined or stale value when the single-empty-morph
        # branch is taken.
        aligned_results_lst = []
        if len(morphs) == 1 and len(morphs[0]) == 0:
            # A single empty allomorph: nothing to align.
            aligned_morphs_lst = []
        else:
            if args.verbosity >= 5:
                print("morphs:", morphs)
            aligned_results_lst = \
                multialign.multialign(morphs,
                                      max_zeros=args.extra_zeros,
                                      best_count=1)
            if aligned_results_lst:
                weight, aligned_morphs_lst = aligned_results_lst[0]
            else:
                aligned_morphs_lst = []
        if args.verbosity >= 5:
            print("aligned_results_lst:", aligned_results_lst)
        alignments[morpheme] = aligned_morphs_lst

    print("-- STEP 2 COMPLETED (alignments done) --")

    # STEP 3:
    # Compute the zero filled morphs out of the sequences of aligned symbols

    # aligned_morphs[morpheme][original_morph] == zero-filled morph
    aligned_morphs = {}

    for morpheme, aligned_morphs_lst in alignments.items():
        # e.g. "KOTA", ['kota', 'koda', 'kotØ', 'kodØ']
        if args.verbosity >= 5:
            print("aligned_morphs_lst:", aligned_morphs_lst)
        if morpheme not in aligned_morphs:
            aligned_morphs[morpheme] = collections.OrderedDict()
        if aligned_morphs_lst:
            # Stripping the zeros recovers the original morph as the key.
            # NOTE(review): the zero symbol is hard-coded as "Ø" here even
            # though --zero-symbol is accepted — presumably multialign
            # produces "Ø"; confirm before generalizing.
            original_morphs = [x.replace("Ø", "") for x in aligned_morphs_lst]
            for origm, zerofm in zip(original_morphs, aligned_morphs_lst):
                aligned_morphs[morpheme][origm] = zerofm
        else:
            aligned_morphs[morpheme] = {"": ""}
    if args.verbosity >= 5:
        print("aligned_morphs", aligned_morphs)

    print("-- STEP 3 COMPLETED (aligned_morphs done) --")

    # STEP 4:
    # Write the example word forms plus their zero-filled morphs

    with open(args.output, "w", newline="") as out_file:
        writer = csv.DictWriter(out_file,
                                ["MORPHEMES", "MORPHS", "ZEROFILLED"],
                                delimiter=args.csv_delimiter)
        writer.writeheader()
        # NOTE(review): forms_of_morphs is built but never used afterwards —
        # kept for parity with the original; candidate for removal.
        forms_of_morphs = {}
        d = {}
        for seg_example in seg_example_list:
            if args.verbosity >= 20:
                print("seg_example:", seg_example)
            morpheme_lst = [morpheme for morpheme, morph in seg_example]
            morph_lst = [morph for morpheme, morph in seg_example]
            zero_filled_morph_lst = \
                [aligned_morphs[morpheme].get(morph.replace("Ø", ""), "")
                 for (morpheme, morph) in seg_example]
            if args.verbosity >= 20:
                print("zero_filled_morph_lst:", zero_filled_morph_lst)
            d["MORPHEMES"] = args.name_separator.join(morpheme_lst)
            d["MORPHS"] = args.morph_separator.join(morph_lst)
            d["ZEROFILLED"] = args.morph_separator.join(zero_filled_morph_lst)
            writer.writerow(d)
            if morph_lst[0] not in forms_of_morphs:
                forms_of_morphs[morph_lst[0]] = set()
            forms_of_morphs[morph_lst[0]].add(
                " ".join(x for x in morpheme_lst[1:]))

    print("-- STEP 4 COMPLETED (zero-filled morphs and the CSV file done) --")
    return