def main():
    """Guess and join extra data for Finnish TSV lexical databases.

    Reads word entries from --input, joins attribute rows from --join
    (keyed on the ``new_paras`` field), runs the guessing pipeline and
    writes the enriched rows to --output.  Exits with status 1 on
    database-integrity errors.
    """
    # initialise argument parser
    ap = argparse.ArgumentParser(
        description="Guess more data for Finnish TSV databases")
    ap.add_argument("--quiet", "-q", action="store_false", dest="verbose",
                    default=False,
                    help="do not print output to stdout while processing")
    ap.add_argument("--verbose", "-v", action="store_true", default=False,
                    help="print each step to stdout while processing")
    ap.add_argument("--input", "-i", action="store", required=True,
                    metavar="IFILE", help="read data from IFILE")
    ap.add_argument("--version", "-V", action="version")
    ap.add_argument("--output", "-o", action="store", required=True,
                    metavar="OFILE", help="write data to OFILE")
    # FIX: type=int added — without it a command-line value arrives as str
    # and the later "len(tsv_parts) < args.fields" comparison raises
    # TypeError in Python 3.
    ap.add_argument("--fields", "-f", action="store", default=2, type=int,
                    metavar="N", help="read N fields from master")
    ap.add_argument("--join", "-j", action="store", required=True,
                    metavar="JFILE", help="read join fields from JFILE")
    ap.add_argument("--separator", "-s", action="store", default="\t",
                    metavar="SEP", help="use SEP as separator")
    ap.add_argument("--comment", "-C", action="append", default=["#"],
                    metavar="COMMENT",
                    help="skip lines starting with COMMENT that"
                    "do not have SEPs")
    ap.add_argument("--strip", "-S", action="store",
                    metavar="STRIP", help="strip STRIP characters")
    args = ap.parse_args()
    # quoting is only enabled when the strip character is a quote mark
    if args.strip == '"' or args.strip == "'":
        quoting = csv.QUOTE_ALL
        quotechar = args.strip
    else:
        quoting = csv.QUOTE_NONE
        quotechar = None
    errors = False
    joinmap = dict()
    # read joins from file if any
    with open(args.join, 'r', newline='') as joins:
        join_reader = csv.DictReader(joins, delimiter=args.separator,
                                     quoting=quoting, escapechar='\\',
                                     strict=True)
        for join_parts in join_reader:
            if len(join_parts) < 3:
                # FIX: typo "at leas" -> "at least" in the error message
                print("Must have at least N separators in joins; skipping",
                      join_parts)
                continue
            key = join_parts['new_paras'].strip('[]')
            joinmap[key] = join_parts
    # read from csv files
    with open(args.output, 'w', newline='') as output:
        tsv_writer = csv.DictWriter(output,
                                    fieldnames=get_wordmap_fieldnames(),
                                    delimiter=args.separator,
                                    quoting=quoting, escapechar='%',
                                    quotechar=quotechar, strict=True)
        tsv_writer.writeheader()
        with open(args.input, 'r', newline='') as infile:
            tsv_reader = csv.reader(infile, delimiter=args.separator,
                                    quoting=quoting, escapechar='\\',
                                    strict=True)
            linecount = 0
            for tsv_parts in tsv_reader:
                linecount += 1
                if args.verbose and (linecount % 10000 == 0):
                    print(linecount, "...", sep='', end='\r')
                if len(tsv_parts) < args.fields:
                    print("Must have at least N separators on each",
                          "non-comment non-empty line; skipping:",
                          tsv_parts, file=stderr)
                    continue
                # here starts the guessworks: the aim is to fill dict
                # wordmap with data necessary to generate a lexc line
                wordmap = init_wordmap()
                wordmap = parse_defaults_from_tsv(wordmap, tsv_parts)
                wordmap = parse_extras_from_tsv(wordmap, tsv_parts)
                # extend from known new paras
                joinkey = ",".join(wordmap['new_paras'])
                if joinkey in joinmap:
                    for k, v in joinmap[joinkey].items():
                        if k != 'new_paras':
                            # revive typed values from their string forms
                            if v == "False":
                                wordmap[k] = False
                            elif v == "None":
                                wordmap[k] = None
                            elif k == 'kotus_tn':
                                wordmap[k] = int(v)
                            else:
                                wordmap[k] = v
                else:
                    print("\033[93mMissing!\033[0m",
                          "new para not in join data:", joinkey)
                    errors = True
                    continue
                # guess works in order
                wordmap = guess_stem_features_ktn(wordmap)
                wordmap = guess_pronunciation(wordmap)
                wordmap = guess_grade_dir_from_ktn(wordmap)
                wordmap = guess_harmony(wordmap)
                wordmap = guess_new_class(wordmap)
                # here is actual python code doing the pre-processing
                wordmap = plurale_tantum_get_singular_stem(wordmap)
                wordmap = gradation_make_morphophonemes(wordmap)
                wordmap = stub_all_ktn(wordmap)
                # suffixes can be id'd by the - in beginning.
                # They need an own lexicon
                wordmap = guess_bound_morphs(wordmap)
                if wordmap['is_suffix']:
                    wordmap['real_pos'] = wordmap['pos']
                    wordmap['pos'] = 'SUFFIX'
                # put interjections in separate lexicon to allow
                # chaining them
                if "'PCLE_HAH'" in wordmap['new_paras']:
                    wordmap['real_pos'] = wordmap['pos']
                    wordmap['pos'] = 'INTERJECTION'
                # split multiple particle or subcat definitions to
                # distinct lexemes
                wordmaps = [wordmap]
                wordmaps = [m for wm in wordmaps
                            for m in split_wordmap_by_field(wm, 'particle')]
                wordmaps = [m for wm in wordmaps
                            for m in split_wordmap_by_field(wm, 'subcat')]
                wordmaps = [m for wm in wordmaps
                            for m in split_wordmap_by_field(wm, 'symbol')]
                # print result
                for wordmap in wordmaps:
                    tsv_writer.writerow(wordmap)
    if errors:
        print("you must fix database integrity or hack the scripts",
              "before continuing")
        exit(1)
    exit()
def main():
    """Convert Omorfi's lexical data from old kotus-csv to newpara-tsv.

    Reads CSV lines of the form ``"lemma","ktn","kav","pos",...`` from
    --input (or stdin), optionally merges plurale-tantum info from
    --plt-file, runs the paradigm-guessing pipeline and prints one
    newpara-tsv line per lexeme to --output (or stdout).
    """
    ap = argparse.ArgumentParser(
        description="Converts Omorfi's lexical data from old kotus-csv "
        "format to newpara-tsv with possible attribute fields")
    ap.add_argument("--input", "-i", metavar="INFILE",
                    help="read data from INFILE")
    ap.add_argument("--output", "-o", metavar="OUTFILE",
                    help="write converted stuff to OUTFILE")
    ap.add_argument("--plt-file", "-p", metavar="PLTFILE",
                    help="read plurale tantum info (csv) from PLTFILE")
    ap.add_argument("--verbose", "-v", action="store_true",
                    help="Print verbosely while processing")
    ap.add_argument("--version", "-V", action="version")
    args = ap.parse_args()
    # default False so missing keys read as "not plurale tantum"
    plt_info = defaultdict(lambda: False)
    if args.plt_file:
        if args.verbose:
            print("Reading plurale tantum data from", args.plt_file,
                  file=stderr)
        with open(args.plt_file, "r", newline="") as plt_in:
            headers_skipped = False
            for line in plt_in:
                if headers_skipped:
                    # key is everything up to the last comma; value is the
                    # plt column with quotes stripped
                    lex, plt = line.rsplit(",", 1)
                    plt_info[lex] = plt.strip('"')
                elif line.find("HEADERS") >= 0:
                    headers_skipped = True
    # FIX: use distinct names (no builtin shadowing) and close only the
    # streams opened here — the original closed stdin/stdout as well.
    infile = open(args.input, "r", newline="") if args.input else stdin
    outfile = open(args.output, "w", newline="") if args.output else stdout
    for line in infile:
        if line.startswith("#") or line.find("<-HEADERS") >= 0:
            continue
        fields = line.strip('"\n').split('","')
        if len(fields) < 4:
            if len(fields) > 0:
                if args.verbose:
                    print("Skipping too short line:", line, file=stderr)
            continue
        wordmap = init_wordmap()
        wordmap["stub"] = wordmap["lemma"] = fields[0]
        if args.verbose:
            print(wordmap["lemma"])
        wordmap["kotus_tn"] = int(fields[1])
        wordmap["kotus_av"] = fields[2]
        # "0" marks no consonant gradation
        if wordmap["kotus_av"] == "0":
            wordmap["kotus_av"] = False
        wordmap["pos"] = fields[3]
        wordmap = expand_pos(wordmap)
        if plt_info:
            # plt file is keyed by the re-quoted first four fields
            wordmap["plurale_tantum"] = plt_info[
                '"' + '","'.join(fields[0:4]) + '"']
        wordmap = guess_stem_features_ktn(wordmap)
        wordmap = guess_pronunciation(wordmap)
        wordmap = guess_grade_dir_from_ktn(wordmap)
        wordmap = guess_harmony(wordmap)
        wordmap = guess_new_class(wordmap)
        wordmap["extras"] = "\t".join(fields[4:])
        if wordmap["extras"]:
            wordmap["extras"] = "\t" + wordmap["extras"]
        if args.verbose:
            print("Guessed new para: %(new_paras)r" % (wordmap))
        print("%(lemma)s\t%(new_paras)r%(extras)s" % (wordmap), file=outfile)
    if args.input:
        infile.close()
    if args.output:
        outfile.close()
    exit()
def main():
    """Convert Omorfi's lexical data from old kotus-csv to newpara-tsv.

    Like the plain converter, but also recognises ``plt=`` and
    ``boundaries=`` attribute fields after the first four columns.
    """
    ap = argparse.ArgumentParser(
        description="Converts Omorfi's lexical data from old kotus-csv "
        "format to newpara-tsv with possible attribute fields")
    ap.add_argument("--input", "-i", metavar="INFILE",
                    help="read data from INFILE")
    ap.add_argument('--output', '-o', metavar="OUTFILE",
                    help="write converted stuff to OUTFILE")
    ap.add_argument('--plt-file', '-p', metavar="PLTFILE",
                    help="read plurale tantum info (csv) from PLTFILE")
    ap.add_argument('--verbose', '-v', action="store_true",
                    help="Print verbosely while processing")
    ap.add_argument("--version", "-V", action="version")
    args = ap.parse_args()
    # default False so missing keys read as "not plurale tantum"
    plt_info = defaultdict(lambda: False)
    if args.plt_file:
        if args.verbose:
            print("Reading plurale tantum data from", args.plt_file,
                  file=stderr)
        with open(args.plt_file, 'r', newline='') as plt_in:
            headers_skipped = False
            for line in plt_in:
                if headers_skipped:
                    lex, plt = line.rsplit(',', 1)
                    plt_info[lex] = plt.strip('"')
                elif line.find('HEADERS') >= 0:
                    headers_skipped = True
    # FIX: use distinct names (no builtin shadowing) and close only the
    # streams opened here — the original closed stdin/stdout as well.
    infile = open(args.input, 'r', newline='') if args.input else stdin
    outfile = open(args.output, 'w', newline='') if args.output else stdout
    for line in infile:
        if line.startswith('#') or line.find('<-HEADERS') >= 0:
            continue
        fields = line.strip('"\n').split('","')
        if len(fields) < 4:
            if len(fields) > 0:
                if args.verbose:
                    print("Skipping too short line:", line, file=stderr)
            continue
        wordmap = init_wordmap()
        wordmap['stub'] = wordmap['lemma'] = fields[0]
        if args.verbose:
            print(wordmap['lemma'])
        wordmap['kotus_tn'] = int(fields[1])
        wordmap['kotus_av'] = fields[2]
        # '0' marks no consonant gradation
        if wordmap['kotus_av'] == '0':
            wordmap['kotus_av'] = False
        wordmap['pos'] = fields[3]
        wordmap = expand_pos(wordmap)
        if plt_info:
            # plt file is keyed by the re-quoted first four fields
            wordmap['plurale_tantum'] = plt_info[
                '"' + '","'.join(fields[0:4]) + '"']
        for i in range(4, len(fields)):
            # NOTE(review): the 'plt='/'boundaries=' prefixes are kept in
            # the stored values — confirm downstream parsers expect that.
            if fields[i].startswith('plt='):
                wordmap['plurale_tantum'] = fields[i]
            elif fields[i].startswith('boundaries='):
                fields[i] = fields[i].replace('|', '{WB}')
                wordmap['stub'] = wordmap['boundaries'] = fields[i]
        wordmap = guess_stem_features_ktn(wordmap)
        wordmap = guess_pronunciation(wordmap)
        wordmap = guess_grade_dir_from_ktn(wordmap)
        wordmap = guess_harmony(wordmap)
        wordmap = guess_new_class(wordmap)
        wordmap['extras'] = '\t'.join(fields[4:])
        if wordmap['extras']:
            wordmap['extras'] = '\t' + wordmap['extras']
        if args.verbose:
            print("Guessed new para: %(new_paras)r" % (wordmap))
        print("%(lemma)s\t%(new_paras)r%(extras)s" % (wordmap), file=outfile)
    if args.input:
        infile.close()
    if args.output:
        outfile.close()
    exit()
def do_guessing(args):
    """Build a wordmap from command-line args and guess missing features.

    Fills lemma, pos and kotus classes from *args*, then guesses
    pronunciation, harmony, pos and the new paradigm class, and finally
    prints (or appends to ``args.output``) a lexemes.tsv line.
    """
    if args.verbose:
        print("Initialising wordmap from args...", file=stderr)
    word = init_wordmap()
    word['lemma'] = args.lemma
    word['stub'] = args.lemma
    if args.verbose:
        print(word['lemma'], file=stderr)
    if args.pos:
        word['pos'] = args.pos
        if args.verbose:
            print(word['pos'], file=stderr)
    if args.ktn:
        word['kotus_tn'] = int(args.ktn)
        if args.verbose:
            print(word['kotus_tn'], file=stderr)
    # FIX: collapsed the original if/elif/else — the final else branch was
    # unreachable ('elif args.kav == "0"' covers the complement of '!= "0"').
    # '0' marks no consonant gradation.
    word['kotus_av'] = args.kav if args.kav != '0' else False
    if args.verbose:
        print(word['kotus_av'], file=stderr)
    if args.newpara:
        word['new_paras'].append(args.newpara)
        if args.verbose:
            print(word['new_paras'], file=stderr)
    if args.plt:
        word['plurale_tantum'] = True
        if args.verbose:
            print('plurale_tantum = True', file=stderr)
    # this we can guess
    if args.verbose:
        print("Guessing pronunciation from lemma", word['lemma'], "...",
              file=stderr)
    word = guess_pronunciation(word)
    if args.verbose:
        print("Resolved to", word['pronunciation'], file=stderr)
        print("Guessing harmony from that...", file=stderr)
    word = guess_harmony(word)
    if args.verbose:
        print("Resolved to", word['harmony'], file=stderr)
    if not word['pos']:
        if word['kotus_tn']:
            # kotus paradigm ranges map to broad pos categories
            if args.verbose:
                print("Guessing pos from ktn...", file=stderr)
            if word['kotus_tn'] < 52:
                word['pos'] = 'NOUN'
            elif word['kotus_tn'] < 79:
                word['pos'] = 'VERB'
            else:
                word['pos'] = 'PARTICLE'
        else:
            if args.verbose:
                print("Guessing pos from lemma...", file=stderr)
            # FIX: 'ää' restored — the source had mojibake ('รครค', the
            # UTF-8 bytes of 'ää' mis-decoded), breaking the back-vowel
            # infinitive test.
            if word['lemma'].endswith('aa') or word['lemma'].endswith('ää'):
                word['pos'] = 'VERB'
            elif word['lemma'].endswith('sti'):
                word['pos'] = 'ADVERB'
            else:
                word['pos'] = 'NOUN'
    # particles subsume several closed classes; remember the real pos
    if word['pos'] in ('ADVERB', 'ADPOSITION', 'CONJUNCTION',
                       'INTERJECTION'):
        word['particle'] = word['pos']
        word['pos'] = 'PARTICLE'
    if word['pos'] == 'PARTICLE':
        word['kotus_tn'] = 99
    if not word['new_paras']:
        if args.verbose:
            print("Guessing new para from ktn...", file=stderr)
        word = guess_new_class(word)
    if not args.output:
        if args.verbose:
            print("Following is for lexemes.tsv:")
        print("%(lemma)s\t%(new_paras)r" % (word))
    else:
        with open(args.output, 'a') as output:
            print("%(lemma)s\t%(new_paras)r" % (word), file=output)
def main():
    """Convert Omorfi's lexical data from old kotus-csv to newpara-tsv.

    Handles optional ``plt=`` and ``boundaries=`` attribute fields after
    the first four CSV columns.
    """
    ap = argparse.ArgumentParser(
        description="Converts Omorfi's lexical data from old kotus-csv "
        "format to newpara-tsv with possible attribute fields")
    ap.add_argument("--input", "-i", metavar="INFILE",
                    help="read data from INFILE")
    ap.add_argument('--output', '-o', metavar="OUTFILE",
                    help="write converted stuff to OUTFILE")
    ap.add_argument('--plt-file', '-p', metavar="PLTFILE",
                    help="read plurale tantum info (csv) from PLTFILE")
    ap.add_argument('--verbose', '-v', action="store_true",
                    help="Print verbosely while processing")
    ap.add_argument("--version", "-V", action="version")
    args = ap.parse_args()
    # default False so missing keys read as "not plurale tantum"
    plt_info = defaultdict(lambda: False)
    if args.plt_file:
        if args.verbose:
            print("Reading plurale tantum data from", args.plt_file,
                  file=stderr)
        with open(args.plt_file, 'r', newline='') as plt_in:
            headers_skipped = False
            for line in plt_in:
                if headers_skipped:
                    lex, plt = line.rsplit(',', 1)
                    plt_info[lex] = plt.strip('"')
                elif line.find('HEADERS') >= 0:
                    headers_skipped = True
    # FIX: use distinct names (no builtin shadowing) and close only the
    # streams opened here — the original closed stdin/stdout as well.
    infile = open(args.input, 'r', newline='') if args.input else stdin
    outfile = open(args.output, 'w', newline='') if args.output else stdout
    for line in infile:
        if line.startswith('#') or line.find('<-HEADERS') >= 0:
            continue
        fields = line.strip('"\n').split('","')
        if len(fields) < 4:
            if len(fields) > 0:
                if args.verbose:
                    print("Skipping too short line:", line, file=stderr)
            continue
        wordmap = init_wordmap()
        wordmap['stub'] = wordmap['lemma'] = fields[0]
        if args.verbose:
            print(wordmap['lemma'])
        wordmap['kotus_tn'] = int(fields[1])
        wordmap['kotus_av'] = fields[2]
        # '0' marks no consonant gradation
        if wordmap['kotus_av'] == '0':
            wordmap['kotus_av'] = False
        wordmap['pos'] = fields[3]
        wordmap = expand_pos(wordmap)
        if plt_info:
            # plt file is keyed by the re-quoted first four fields
            wordmap['plurale_tantum'] = plt_info[
                '"' + '","'.join(fields[0:4]) + '"']
        for i in range(4, len(fields)):
            # NOTE(review): the 'plt='/'boundaries=' prefixes are kept in
            # the stored values — confirm downstream parsers expect that.
            if fields[i].startswith('plt='):
                wordmap['plurale_tantum'] = fields[i]
            elif fields[i].startswith('boundaries='):
                fields[i] = fields[i].replace('|', '{WB}')
                wordmap['stub'] = wordmap['boundaries'] = fields[i]
        wordmap = guess_stem_features_ktn(wordmap)
        wordmap = guess_pronunciation(wordmap)
        wordmap = guess_grade_dir_from_ktn(wordmap)
        wordmap = guess_harmony(wordmap)
        wordmap = guess_new_class(wordmap)
        wordmap['extras'] = '\t'.join(fields[4:])
        if wordmap['extras']:
            wordmap['extras'] = '\t' + wordmap['extras']
        if args.verbose:
            print("Guessed new para: %(new_paras)r" % (wordmap))
        print("%(lemma)s\t%(new_paras)r%(extras)s" % (wordmap), file=outfile)
    if args.input:
        infile.close()
    if args.output:
        outfile.close()
    exit()