import argparse
import csv
from sys import stderr

# Helper functions used below (init_wordmap, get_wordmap_fieldnames,
# parse_defaults_from_tsv, parse_extras_from_tsv, the guess_* family,
# plurale_tantum_get_singular_stem, gradation_make_morphophonemes,
# stub_all_ktn and split_wordmap_by_field) are assumed to be imported
# from the surrounding omorfi package.


def main():
    # initialise argument parser
    ap = argparse.ArgumentParser(
        description="Guess more data for Finnish TSV databases")
    ap.add_argument("--quiet", "-q", action="store_false", dest="verbose",
                    default=False,
                    help="do not print output to stdout while processing")
    ap.add_argument("--verbose", "-v", action="store_true", default=False,
                    help="print each step to stdout while processing")
    ap.add_argument("--input", "-i", action="store", required=True,
                    metavar="IFILE", help="read data from IFILE")
    ap.add_argument("--version", "-V", action="version")
    ap.add_argument("--output", "-o", action="store", required=True,
                    metavar="OFILE", help="write data to OFILE")
    # type=int so the field-count comparison below works when the flag
    # is given on the command line
    ap.add_argument("--fields", "-f", action="store", type=int, default=2,
                    metavar="N", help="read N fields from master")
    ap.add_argument("--join", "-j", action="store", required=True,
                    metavar="JFILE", help="read join fields from JFILE")
    ap.add_argument("--separator", "-s", action="store", default="\t",
                    metavar="SEP", help="use SEP as separator")
    ap.add_argument("--comment", "-C", action="append", default=["#"],
                    metavar="COMMENT",
                    help="skip lines starting with COMMENT that "
                         "do not have SEPs")
    ap.add_argument("--strip", "-S", action="store",
                    metavar="STRIP", help="strip STRIP characters")
    args = ap.parse_args()

    if args.strip == '"' or args.strip == "'":
        quoting = csv.QUOTE_ALL
        quotechar = args.strip
    else:
        quoting = csv.QUOTE_NONE
        quotechar = None

    errors = False
    joinmap = dict()
    # read joins from file if any
    with open(args.join, 'r', newline='') as joins:
        join_reader = csv.DictReader(joins, delimiter=args.separator,
                                     quoting=quoting, escapechar='\\',
                                     strict=True)
        for join_parts in join_reader:
            if len(join_parts) < 3:
                print("Must have at least N separators in joins; skipping",
                      join_parts)
                continue
            key = join_parts['new_paras'].strip('[]')
            joinmap[key] = join_parts

    # read from csv files
    with open(args.output, 'w', newline='') as output:
        tsv_writer = csv.DictWriter(output,
                                    fieldnames=get_wordmap_fieldnames(),
                                    delimiter=args.separator,
                                    quoting=quoting, escapechar='%',
                                    quotechar=quotechar, strict=True)
        tsv_writer.writeheader()
        with open(args.input, 'r', newline='') as infile:
            tsv_reader = csv.reader(infile, delimiter=args.separator,
                                    quoting=quoting, escapechar='\\',
                                    strict=True)
            linecount = 0
            for tsv_parts in tsv_reader:
                linecount += 1
                if args.verbose and (linecount % 10000 == 0):
                    print(linecount, "...", sep='', end='\r')
                if len(tsv_parts) < args.fields:
                    print("Must have at least N separators on each",
                          "non-comment non-empty line; skipping:",
                          tsv_parts, file=stderr)
                    continue
                # here starts the guesswork: the aim is to fill the dict
                # wordmap with the data necessary to generate a lexc line
                wordmap = init_wordmap()
                wordmap = parse_defaults_from_tsv(wordmap, tsv_parts)
                wordmap = parse_extras_from_tsv(wordmap, tsv_parts)
                # extend from known new paras
                joinkey = ",".join(wordmap['new_paras'])
                if joinkey in joinmap:
                    for k, v in joinmap[joinkey].items():
                        if k != 'new_paras':
                            if v == "False":
                                wordmap[k] = False
                            elif v == "None":
                                wordmap[k] = None
                            elif k == 'kotus_tn':
                                wordmap[k] = int(v)
                            else:
                                wordmap[k] = v
                else:
                    print("\033[93mMissing!\033[0m",
                          "new para not in join data:", joinkey)
                    errors = True
                    continue
                # guesses work in order
                wordmap = guess_stem_features_ktn(wordmap)
                wordmap = guess_pronunciation(wordmap)
                wordmap = guess_grade_dir_from_ktn(wordmap)
                wordmap = guess_harmony(wordmap)
                wordmap = guess_new_class(wordmap)
                # here is the actual pre-processing
                wordmap = plurale_tantum_get_singular_stem(wordmap)
                wordmap = gradation_make_morphophonemes(wordmap)
                wordmap = stub_all_ktn(wordmap)
                # suffixes can be identified by the leading -;
                # they need a lexicon of their own
                wordmap = guess_bound_morphs(wordmap)
                if wordmap['is_suffix']:
                    wordmap['real_pos'] = wordmap['pos']
                    wordmap['pos'] = 'SUFFIX'
                # put interjections in a separate lexicon to allow
                # chaining them
                if "'PCLE_HAH'" in wordmap['new_paras']:
                    wordmap['real_pos'] = wordmap['pos']
                    wordmap['pos'] = 'INTERJECTION'
                # split multiple particle or subcat definitions into
                # distinct lexemes
                wordmaps = [wordmap]
                wordmaps = [m for wm in wordmaps
                            for m in split_wordmap_by_field(wm, 'particle')]
                wordmaps = [m for wm in wordmaps
                            for m in split_wordmap_by_field(wm, 'subcat')]
                wordmaps = [m for wm in wordmaps
                            for m in split_wordmap_by_field(wm, 'symbol')]
                # print result
                for wordmap in wordmaps:
                    tsv_writer.writerow(wordmap)
    if errors:
        print("you must fix database integrity or hack the scripts",
              "before continuing")
        exit(1)
    exit()


if __name__ == "__main__":
    main()
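The final splitting step above fans a single wordmap out into several lexemes, one per particle, subcat or symbol value, by repeatedly flattening the results of `split_wordmap_by_field` with nested comprehensions. Below is a minimal stand-in sketch of that contract, assuming multi-valued fields pack their values with a `|` separator; the real omorfi helper may encode them differently.

# hypothetical stand-in for omorfi's split_wordmap_by_field; the '|'
# separator is an illustration only, not the library's actual encoding
def split_wordmap_by_field(wordmap, field):
    values = wordmap.get(field)
    if not values or '|' not in str(values):
        return [wordmap]  # nothing to split: keep the map as-is
    return [dict(wordmap, **{field: v}) for v in str(values).split('|')]

# the fan-out pattern from main(): each pass may multiply the maps
wordmaps = [{'lemma': 'ja', 'particle': 'ADVERBIAL|COORDINATING'}]
wordmaps = [m for wm in wordmaps
            for m in split_wordmap_by_field(wm, 'particle')]
# -> two wordmaps: one with particle='ADVERBIAL', one 'COORDINATING'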
import argparse
from collections import defaultdict
from sys import stderr, stdin, stdout

# Helper functions used below (init_wordmap, expand_pos and the guess_*
# family) are assumed to be imported from the surrounding omorfi package.


def main():
    ap = argparse.ArgumentParser(
        description="Converts Omorfi's lexical data from old kotus-csv "
                    "format to newpara-tsv with possible attribute fields")
    ap.add_argument("--input", "-i", metavar="INFILE",
                    help="read data from INFILE")
    ap.add_argument("--output", "-o", metavar="OUTFILE",
                    help="write converted stuff to OUTFILE")
    ap.add_argument("--plt-file", "-p", metavar="PLTFILE",
                    help="read plurale tantum info (csv) from PLTFILE")
    ap.add_argument("--verbose", "-v", action="store_true",
                    help="print verbosely while processing")
    ap.add_argument("--version", "-V", action="version")
    args = ap.parse_args()

    # unknown lexemes default to False, i.e. not plurale tantum
    plt_info = defaultdict(lambda: False)
    if args.plt_file:
        if args.verbose:
            print("Reading plurale tantum data from", args.plt_file,
                  file=stderr)
        with open(args.plt_file, 'r', newline='') as plt_in:
            headers_skipped = False
            for line in plt_in:
                if headers_skipped:
                    lex, plt = line.rsplit(',', 1)
                    plt_info[lex] = plt.strip('"')
                elif line.find('HEADERS') >= 0:
                    headers_skipped = True

    if args.input:
        input = open(args.input, 'r', newline='')
    else:
        input = stdin
    if args.output:
        output = open(args.output, 'w', newline='')
    else:
        output = stdout

    for line in input:
        if line.startswith('#') or line.find('<-HEADERS') >= 0:
            continue
        fields = line.strip('"\n').split('","')
        if len(fields) < 4:
            if len(fields) > 0 and args.verbose:
                print("Skipping too short line:", line, file=stderr)
            continue
        wordmap = init_wordmap()
        wordmap['stub'] = wordmap['lemma'] = fields[0]
        if args.verbose:
            print(wordmap['lemma'])
        wordmap['kotus_tn'] = int(fields[1])
        wordmap['kotus_av'] = fields[2]
        if wordmap['kotus_av'] == '0':
            wordmap['kotus_av'] = False
        wordmap['pos'] = fields[3]
        wordmap = expand_pos(wordmap)
        if plt_info:
            # the plt table is keyed by the original quoted csv record
            wordmap['plurale_tantum'] = \
                plt_info['"' + '","'.join(fields[0:4]) + '"']
        for i in range(4, len(fields)):
            if fields[i].startswith('plt='):
                wordmap['plurale_tantum'] = fields[i]
            elif fields[i].startswith('boundaries='):
                fields[i] = fields[i].replace('|', '{WB}')
                wordmap['stub'] = wordmap['boundaries'] = fields[i]
        wordmap = guess_stem_features_ktn(wordmap)
        wordmap = guess_pronunciation(wordmap)
        wordmap = guess_grade_dir_from_ktn(wordmap)
        wordmap = guess_harmony(wordmap)
        wordmap = guess_new_class(wordmap)
        wordmap['extras'] = '\t'.join(fields[4:])
        if wordmap['extras']:
            wordmap['extras'] = '\t' + wordmap['extras']
        if args.verbose:
            print("Guessed new para: %(new_paras)r" % (wordmap))
        print("%(lemma)s\t%(new_paras)r%(extras)s" % (wordmap), file=output)
    input.close()
    output.close()
    exit()


if __name__ == "__main__":
    main()
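The converter assumes kotus-csv input in which every field is wrapped in double quotes, so stripping the outer quotes and splitting on `","` is enough to tokenise a record. Below is a small worked example with a made-up record; the four mandatory fields are lemma, Kotus paradigm number, gradation class and part of speech, and the `plt=` extra is hypothetical.

# made-up kotus-csv record: lemma, kotus_tn, kotus_av, pos, extras...
line = '"talo","1","0","N","plt=False"\n'
fields = line.strip('"\n').split('","')
print(fields)  # ['talo', '1', '0', 'N', 'plt=False']

# the first four fields map onto the wordmap as in main():
kotus_tn = int(fields[1])  # 1, the Kotus paradigm number
kotus_av = fields[2]       # '0' is normalised to False (no gradation)
pos = fields[3]            # 'N' for noun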