def field_sort_key(self):
    if tt.is_kana(self.args.query_s):
        return ['reading', 'kanji'].index(self.field)
    elif tt.is_latin(self.args.query_s):
        if self.args.field == 'reading':
            # try it converted to kana first
            return ['reading', 'gloss', 'kanji'].index(self.field)
        else:
            # try to interpret as gloss first
            return ['gloss', 'reading', 'kanji'].index(self.field)
    else:
        # doesn't look like kana or latin; probably kanji, but who knows
        return ['kanji', 'reading', 'gloss'].index(self.field)
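# A minimal sketch of how this key can be used, assuming `conditions` is a
# list of SearchConditions objects such as generate_search_conditions()
# below returns (the exact call site is not shown in this file excerpt):
#
#     conditions.sort(key=SearchConditions.field_sort_key)
#
# For a kana query, 'reading' conditions then sort before 'kanji' ones; for a
# Latin query without -r, 'gloss' conditions come first.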
def run(query):
    ap = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)

    ap.add_argument('--version', action='store_true', help='Show version.')

    ag = ap.add_argument_group('Type of query',
                               '''What field to look in. If not provided, try all of them and return the first to match.''')
    ag.add_argument('-k', '--kanji', action='store_const',
                    dest='field', const='kanji', default='auto',
                    help='''Return entries matching query on kanji.''')
    ag.add_argument('-r', '--reading', action='store_const',
                    dest='field', const='reading',
                    help='''Return entries matching query on reading (in kana or rōmaji).''')
    ag.add_argument('-g', '--gloss', '--meaning', action='store_const',
                    dest='field', const='gloss',
                    help='''Return entries matching query on glosses (English translations/meaning).''')

    ag = ap.add_argument_group('Query options')
    ag.add_argument('--case-sensitive', '--sensitive', action='store_true',
                    help='''Case-sensitive search (distinguish uppercase from lowercase).
Default: Insensitive, unless there's an uppercase letter in query.''')
    ag.add_argument('-x', '--regexp', action='store_true',
                    help='''Regular expression search. Extent limits (-e) are respected.
Regexps currently don't work for rōmaji; use kana for readings.''')
    ag.add_argument('-e', '--extent', default='auto',
                    choices=('whole', 'beginning', 'word', 'partial', 'auto'),
                    help='''How much of the field should the query match:
 - whole: Query must match the entire field.
 - beginning: Query must match the beginning of the field.
 - word: Query must match whole word (at present only works for English;
   treated as 'whole' for kanji or reading fields.)
 - partial: Query may match anywhere, even partially inside words.
 - auto (default): Try all four, and return the first to match something.''')
    ag.add_argument('-w', '--whole', action='store_const',
                    const='whole', dest='extent',
                    help='''Equivalent to --extent=whole.''')
    ag.add_argument('-b', '--beginning', action='store_const',
                    const='beginning', dest='extent',
                    help='''Equivalent to --extent=beginning.''')
    ag.add_argument('--word', action='store_const',
                    const='word', dest='extent',
                    help='''Equivalent to --extent=word.''')
    ag.add_argument('-p', '--partial', action='store_const',
                    const='partial', dest='extent',
                    help='Equivalent to --extent=partial.')
    ag.add_argument('-f', '--frequent', '-P', action='store_true',
                    help='''Restrict to frequent words (equivalent to EDICT entries marked as ‘(P)’)''')

    ag = ap.add_argument_group('Output control')
    ag.add_argument('--output-mode', default='tab',
                    choices=('human', 'tab', 'auto'),
                    help='''Output mode; one of:
 - human: Multiline human-readable output.
 - tab: One-line tab-separated.
 - auto (default): Human if output is to terminal, tab if writing to pipe or file.''')
    ag.add_argument('-t', '--tsv', '--tab', action='store_const',
                    const='tab', dest='output_mode',
                    help="Equivalent to --output-mode=tab")
    ag.add_argument('--human', action='store_const',
                    const='human', dest='output_mode',
                    help="Equivalent to --output-mode=human")
    ag.add_argument('--color', choices=('yes', 'no', 'auto'), default='no',
                    help='''Whether to colorize output.
Default 'auto' means to colorize if writing to a terminal.''')
    ag.add_argument('-c', action='store_const', const='yes', dest='color',
                    help='Equivalent to --color=yes')
    ag.add_argument('--background', '--bg',
                    choices=('dark', 'light', 'auto'), default='auto',
                    help='''Use colorscheme for dark or light background. Autodetection can be spotty.
If it's not working for you, you can also set it in the BACKGROUND environment variable.''')
    ag.add_argument('--out-hepburn', '--oh', action='store_const',
                    const=romkan.to_hepburn, dest='out_romaji', default=None,
                    help='Convert reading to Hepburn rōmaji in output.')
    ag.add_argument('--out-kunrei', '--ok', action='store_const',
                    const=romkan.to_kunrei, dest='out_romaji', default=None,
                    help='Convert reading to Kunrei rōmaji in output.')

    ag = ap.add_argument_group('Abbreviations help')
    ag.add_argument('--list-abbrevs', action='store_true',
                    help='''List all abbreviations.''')
    ag.add_argument('-a', '--abbrev', metavar='ABBREV', default=None,
                    help='''Print meaning of an abbreviation.''')

    ap.add_argument('query', help='Text to look for.', metavar='QUERY', nargs='*')

    args = ap.parse_args()

    args.output_mode = 'human'

    if len(query) == 0:
        return None

    args.query = [query]
    args.color = 'yes'
    args.background = 'dark'
    color.use_color = True

    args.query = ' '.join(args.query)

    # case sensitivity must be handled before opening the db
    if not args.case_sensitive:
        if re.search("[A-Z]", args.query):
            args.case_sensitive = True

    if not config:
        print('%s: Could not find config.ini!' % fmt('ERROR', 'error'))

        # print version regardless
        if args.version:
            print(common.version(None))
        sys.exit(2)

    # try to open database
    try:
        con, cur = database.opendb(case_sensitive=args.case_sensitive)
    except database.DatabaseAccessError as e:
        print('''Database error: %s.
Expected database version %s at:
%s

Before using myougiden for the first time, you need to compile the JMdict
(EDICT) dictionary. Try running this command to download and compile it:

    updatedb-myougiden -f

It will take a while, but lookups afterwards will be fast.

JMdict is frequently updated. If you'd like to keep up with new entries,
you might want to add the update command to cron (for example, in
/etc/cron.weekly/myougiden ).'''
              % (str(e),
                 config.get('core', 'dbversion'),
                 config.get('paths', 'database')))

        if args.version:
            print()
            print(common.version(None))

        sys.exit(2)

    # handle short commands first.
    if args.version:
        print(common.version(cur))
        sys.exit(0)
    elif args.list_abbrevs:
        print(orm.abbrevs_table(cur))
        sys.exit(0)
    elif args.abbrev:
        a = orm.abbrev_line(cur, args.abbrev)
        if a:
            print(a)
            sys.exit(0)
        else:
            print('Not found!')
            sys.exit(0)

    # handle query guesswork
    if args.query == '':
        ap.print_help()
        sys.exit(2)

    # 'word' doesn't work for Japanese anyway, and 'whole' is much faster.
    if args.extent == 'word' and args.field in ('kanji', 'reading'):
        args.extent = 'whole'

    # first, we need a dictionary of options with only keys understood
    # by search_by().
    search_args = vars(args).copy()  # turn the Namespace into a dict

    # keep only interesting keys
    for k in list(search_args.keys()):
        if k not in ('field', 'query', 'extent', 'regexp',
                     'case_sensitive', 'frequent'):
            del search_args[k]

    # we'll iterate over all required 'field' and 'extent' conditions.
    #
    # for code clarity, we always use a list of search conditions,
    # even if the size of the list is 1.

    if args.field == 'auto':
        if tt.is_latin(args.query):
            # if pure alphabet, try as English first, then as rōmaji
            fields = ('gloss', 'reading', 'kanji')
        elif tt.is_romaji(args.query):
            # latin with special chars; probably rōmaji
            fields = ('reading', 'gloss', 'kanji')
        elif tt.is_kana(args.query):
            fields = ('reading', 'kanji', 'gloss')
        else:
            fields = ('kanji', 'reading', 'gloss')
    else:
        fields = (args.field,)

    if args.extent != 'auto':
        extents = (args.extent,)
    else:
        extents = ('whole', 'word', 'partial')

    if args.regexp:
        regexp_flags = (True,)
    elif tt.has_regexp_special(args.query):
        regexp_flags = (False, True)
    else:
        regexp_flags = (False,)

    conditions = []
    for regexp in regexp_flags:
        for extent in extents:
            for field in fields:

                # the useless combination; we avoid it so as not to waste time.
                if extent == 'word' and field != 'gloss':
                    if args.extent == 'auto':
                        # we're trying all possibilities, so we can just
                        # skip this one. other extents were/will be tried
                        # elsewhen in the loop.
                        continue
                    else:
                        # not trying all possibilities; this is our only
                        # pass in this field, so let's adjust it.
                        sa = search_args.copy()
                        sa['extent'] = 'whole'
                else:
                    # simple case.
                    sa = search_args.copy()
                    sa['extent'] = extent

                sa['field'] = field
                sa['regexp'] = regexp
                conditions.append(sa)

    # deal with rōmaji queries
    if (args.field in ('auto', 'reading')
            and tt.is_romaji(args.query)):

        if re.search('[A-Z]', args.query):
            kana_guess = (romkan.to_katakana, romkan.to_hiragana)
        else:
            kana_guess = (romkan.to_hiragana, romkan.to_katakana)

        new_conditions = conditions[:]
        for oldcond in conditions:
            if oldcond['field'] == 'reading':
                for kanafn in kana_guess:
                    # the query looks like rōmaji and the field is reading,
                    # so we try it converted to kana _first_, then as-is;
                    # hence the insert.
                    for romaji in tt.expand_romaji(oldcond['query']):
                        newcond = oldcond.copy()
                        newcond['query'] = kanafn(romaji)
                        new_conditions.insert(new_conditions.index(oldcond),
                                              newcond)
        conditions = new_conditions

    chosen_search, ent_seqs = search.guess(cur, conditions)

    if chosen_search:
        entries = [orm.fetch_entry(cur, ent_seq) for ent_seq in ent_seqs]

        if args.output_mode == 'human':
            out = [entry.format_human(search_params=chosen_search,
                                      romajifn=args.out_romaji)
                   for entry in entries]
            out = ("\n\n".join(out)) + "\n"
        elif args.output_mode == 'tab':
            out = [entry.format_tsv(search_params=chosen_search,
                                    romajifn=args.out_romaji)
                   for entry in entries]
            # out = ("\n".join(out)) + "\n"
        return out
    else:
        return None
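# Example usage (illustrative; the dictionary database must already have been
# compiled with `updatedb-myougiden -f` for anything to match):
#
#     result = run('猫')        # or run('neko'), run('cat'), ...
#     if result:
#         print(result)         # human-mode output is a single string
#
# run() forces human-readable, colorized output and returns None when the
# query is empty or when nothing matches.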
def generate_search_conditions(args):
    '''args = command-line argument dict (argparse object)'''

    if args.regexp:
        regexp_flags = (True,)
    elif tt.has_regexp_special(args.query_s):
        regexp_flags = (False, True)
    else:
        regexp_flags = (False,)

    if args.field != 'auto':
        fields = (args.field,)
    else:
        if tt.is_kana(args.query_s):
            fields = ('kanji', 'reading')
        else:
            fields = ('kanji', 'reading', 'gloss')

    if args.extent != 'auto':
        extents = (args.extent,)
    else:
        extents = ('whole', 'word', 'beginning', 'partial')

    conditions = []
    for regexp in regexp_flags:
        for field in fields:
            for extent in extents:

                if field == 'gloss' and extent == 'beginning' and args.extent == 'auto':
                    # when we search for e.g. 'man' in auto mode, we typically
                    # don't want a tier that matches 'manatee' but not
                    # 'humanity'; skip from 'word' straight to 'partial' for
                    # glosses.
                    continue
                elif field in ('kanji', 'reading') and extent == 'word':
                    if args.extent == 'auto':
                        # useless combination generated, skip
                        continue
                    else:
                        # useless combination requested, adjust
                        extent = 'whole'

                if field == 'reading' and tt.is_latin(args.query_s):
                    # for the 'reading' field, auto-convert rōmaji to kana; as
                    # of this writing, JMdict has no rōmaji in reading fields.
                    queries = ([romkan.to_hiragana(s) for s in args.query],
                               [romkan.to_katakana(s) for s in args.query])

                    # romkan converts an ASCII hyphen-minus to the CJKV long
                    # vowel mark 'ー'; we convert it back in start position,
                    # to preserve the FTS operator '-'.
                    def fix_hyphen(s):
                        if len(s) > 1 and s[0] == 'ー':
                            s = '-' + s[1:]
                        return s
                    queries = [[fix_hyphen(s) for s in query]
                               for query in queries]
                else:
                    queries = (args.query,)

                # TODO: add wide-char

                for query in queries:
                    conditions.append(SearchConditions(args, query,
                                                       regexp, field, extent))

    return conditions
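# A small illustration of the rōmaji handling above, assuming romkan behaves
# as documented (values shown are what romkan is expected to return):
#
#     romkan.to_hiragana('neko')   # -> 'ねこ'
#     romkan.to_katakana('neko')   # -> 'ネコ'
#
# so a Latin query against the 'reading' field is tried in both scripts, with
# fix_hyphen() restoring a leading '-' so the FTS NOT operator still works,
# e.g. fix_hyphen('ーねこ') -> '-ねこ'.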