def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    Reads the options into module-level settings, loads the candidates
    file(s) through a `CandidatesHandler`, and builds the global `detector`
    from the selected detector class.

    @param opts The (option, value) pairs parsed by getopts.

    @param arg The argument list parsed by getopts.

    @param n_arg The number of arguments expected for this script.

    @param usage_string The usage text, forwarded to
    `treat_options_simplest`.
    """
    global filetype_corpus_ext
    global filetype_candidates_ext
    global output_filetype_ext
    global action_annotate
    global action_filter
    global detector

    treat_options_simplest(opts, arg, n_arg, usage_string)

    detector_class = ContiguousLemmaDetector
    candidates_fnames = []
    n_gaps = None

    for (o, a) in opts:
        if o in ("-c", "--candidates"):
            candidates_fnames.append(a)
        elif o in ("-d", "--detector"):
            detector_class = detectors.get(a, None)
            if detector_class is None:
                error("Unknown detector name: " + a)
        elif o in ("-S", "--source"):
            detector_class = SourceDetector
        elif o in ("-g", "--gaps"):
            # Report a malformed number through error(), like the other
            # numeric options in this toolkit, instead of a raw traceback.
            try:
                n_gaps = int(a)
            except ValueError:
                error("Option --gaps expects an integer value, got: " + a)
        elif o == "--corpus-from":
            filetype_corpus_ext = a
        elif o == "--candidates-from":
            filetype_candidates_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o == "--filter":
            # Filtering alone: annotation is switched off.
            action_annotate = False
            action_filter = True
        elif o == "--filter-and-annot":
            action_filter = True
        else:
            raise Exception("Bad arg: " + o)

    if not candidates_fnames:
        error("No candidates file given!")
    if detector_class == SourceDetector and n_gaps is not None:
        error('Bad arguments: method "Source" with "--gaps"')

    # The candidates can only be loaded once every option has been seen,
    # because --candidates-from selects the file type used to parse them.
    c = CandidatesHandler()
    verbose("Reading MWE list from candidates file")
    filetype.parse(candidates_fnames, c, filetype_candidates_ext)
    verbose("MWE list loaded in memory successfully")
    detector = detector_class(c.info, n_gaps)
def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    Reads the options into module-level settings, loads the candidates
    file(s) through a `CandidatesHandler`, and builds the global `detector`
    from the selected detector class.

    @param opts The (option, value) pairs parsed by getopts.

    @param arg The argument list parsed by getopts.

    @param n_arg The number of arguments expected for this script.

    @param usage_string The usage text, forwarded to
    `treat_options_simplest`.
    """
    global filetype_corpus_ext
    global filetype_candidates_ext
    global output_filetype_ext
    global action_annotate
    global action_filter
    global detector

    treat_options_simplest(opts, arg, n_arg, usage_string)

    detector_class = ContiguousLemmaDetector
    candidates_fnames = []
    n_gaps = None

    for (o, a) in opts:
        if o in ("-c", "--candidates"):
            candidates_fnames.append(a)
        elif o in ("-d", "--detector"):
            detector_class = detectors.get(a, None)
            if detector_class is None:
                error("Unknown detector name: " + a)
        elif o in ("-S", "--source"):
            detector_class = SourceDetector
        elif o in ("-g", "--gaps"):
            # Report a malformed number through error(), like the other
            # numeric options in this toolkit, instead of a raw traceback.
            try:
                n_gaps = int(a)
            except ValueError:
                error("Option --gaps expects an integer value, got: " + a)
        elif o == "--corpus-from":
            filetype_corpus_ext = a
        elif o == "--candidates-from":
            filetype_candidates_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o == "--filter":
            # Filtering alone: annotation is switched off.
            action_annotate = False
            action_filter = True
        elif o == "--filter-and-annot":
            action_filter = True
        else:
            raise Exception("Bad arg: " + o)

    if not candidates_fnames:
        error("No candidates file given!")
    if detector_class == SourceDetector and n_gaps is not None:
        error('Bad arguments: method "Source" with "--gaps"')

    # The candidates can only be loaded once every option has been seen,
    # because --candidates-from selects the file type used to parse them.
    c = CandidatesHandler()
    verbose("Reading MWE list from candidates file")
    filetype.parse(candidates_fnames, c, filetype_candidates_ext)
    verbose("MWE list loaded in memory successfully")
    detector = detector_class(c.info, n_gaps)
def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    After the options are read, the file given with -r/--reference (if any)
    is parsed with a `ReferenceReaderHandler`, and `gs_name` is derived from
    its name by dropping the directory part and every ".xml" occurrence.

    @param opts The (option, value) pairs parsed by getopts.

    @param arg The argument list parsed by getopts.

    @param n_arg The number of arguments expected for this script.

    @param usage_string The usage text, forwarded to
    `treat_options_simplest`.
    """
    global pre_gs
    global ignore_pos
    global gs_name
    global ignore_case
    global lemma_or_surface
    global input_filetype_ext
    global reference_filetype_ext

    ref_name = None

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-r", "--reference"):
            ref_name = a
        elif o in ("-g", "--ignore-pos"):
            ignore_pos = True
        elif o in ("-c", "--case"):
            ignore_case = False
        elif o in ("-L", "--lemma-or-surface"):
            lemma_or_surface = True
        elif o == "--input-from":
            input_filetype_ext = a
        elif o == "--reference-from":
            reference_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    # The reference list needs to be opened after all the options are read,
    # since options such as -g and -c modify the way the list is represented
    if ref_name:
        filetype.parse([ref_name], ReferenceReaderHandler(),
                       reference_filetype_ext)
        # Raw string: "\." is not a valid string escape, so the non-raw
        # spelling triggers an invalid-escape warning on newer interpreters.
        gs_name = re.sub(".*/", "", re.sub(r"\.xml", "", ref_name))

    # There's no reference list... Oh oh cannot evaluate :-(
    # NOTE(review): pre_gs is only read here; presumably it is filled while
    # the reference file is parsed -- confirm against ReferenceReaderHandler.
    if not pre_gs:
        error("You MUST provide a non-empty reference list!")
def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    After the options are read, the file given with -r/--reference (if any)
    is parsed with a `ReferenceReaderHandler`, and `gs_name` is derived from
    its name by dropping the directory part and every ".xml" occurrence.

    @param opts The (option, value) pairs parsed by getopts.

    @param arg The argument list parsed by getopts.

    @param n_arg The number of arguments expected for this script.

    @param usage_string The usage text, forwarded to
    `treat_options_simplest`.
    """
    global pre_gs
    global ignore_pos
    global gs_name
    global ignore_case
    global lemma_or_surface
    global input_filetype_ext
    global reference_filetype_ext

    ref_name = None

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-r", "--reference"):
            ref_name = a
        elif o in ("-g", "--ignore-pos"):
            ignore_pos = True
        elif o in ("-c", "--case"):
            ignore_case = False
        elif o in ("-L", "--lemma-or-surface"):
            lemma_or_surface = True
        elif o == "--input-from":
            input_filetype_ext = a
        elif o == "--reference-from":
            reference_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    # The reference list needs to be opened after all the options are read,
    # since options such as -g and -c modify the way the list is represented
    if ref_name:
        filetype.parse([ref_name], ReferenceReaderHandler(),
                       reference_filetype_ext)
        # Raw string: "\." is not a valid string escape, so the non-raw
        # spelling triggers an invalid-escape warning on newer interpreters.
        gs_name = re.sub(".*/", "", re.sub(r"\.xml", "", ref_name))

    # There's no reference list... Oh oh cannot evaluate :-(
    # NOTE(review): pre_gs is only read here; presumably it is filled while
    # the reference file is parsed -- confirm against ReferenceReaderHandler.
    if not pre_gs:
        error("You MUST provide a non-empty reference list!")
def main(corpus_paths):
    """Count n-grams by parsing the given corpus files.

    When `use_shelve` is set, `ngram_counts` and `selected_candidates` are
    replaced by stores obtained from `make_shelve()`, and those stores are
    destroyed again once parsing has finished -- also when parsing fails.

    @param corpus_paths The corpus file names handed to `filetype.parse`.
    """
    global use_shelve, ngram_counts, selected_candidates
    # Dummy file initialization to avoid warnings in PyCharm
    ngram_counts_tmpfile = selected_candidates_tmpfile = None

    if use_shelve:
        verbose("Making temporary file...")
        (ngram_counts, ngram_counts_tmpfile) = make_shelve()
        (selected_candidates, selected_candidates_tmpfile) = make_shelve()

    try:
        verbose("Counting ngrams...")
        filetype.parse(corpus_paths, NGramCounterHandler(), input_filetype_ext)
    finally:
        # Run the cleanup even if parsing raised, so that the temporary
        # files are not left behind.
        if use_shelve:
            verbose("Removing temporary files...")
            destroy_shelve(ngram_counts, ngram_counts_tmpfile)
            destroy_shelve(selected_candidates, selected_candidates_tmpfile)
print_cand_freq = True elif o in ("-i", "--index") : input_filetype_ext = "BinaryIndex" warn("Option -i is deprecated; use --from=BinaryIndex") elif o == "--id-order": id_order = a.split(":") elif o == "--from" : input_filetype_ext = a elif o == "--to" : output_filetype_ext = a else: raise Exception("Bad flag") if non_overlapping and match_distance == "All": # If we are taking all matches, we need to be able to overlap... error("Conflicting options: --match-distance=All and --non-overlapping") if len(mode) != 1 : error("Exactly one option, -p or -n, must be provided") if "patterns" in mode: global patterns patterns = filetype.parse_entities([patterns_file]) ################################################################################ # MAIN SCRIPT longopts = [ "from=", "to=", "patterns=", "ngram=", "index", "match-distance=", "non-overlapping", "freq", "ignore-pos", "surface", "source", "id-order=" ] arg = read_options( "p:n:id:NfgsS", longopts, treat_options, -1, usage_string ) filetype.parse(arg, CandidatesGeneratorHandler(), input_filetype_ext)
""" global combination global supported_combination global main_freq treat_options_simplest(opts, arg, n_arg, usage_string) for (o, a) in opts: if o in ("-c", "--combination"): try: combination = [] combination = interpret_combinations(a) except ValueError as message: print >> sys.stderr, message print >> sys.stderr, "ERROR: argument must be list separated"+ \ "by \":\" and containing the names: "+\ str( supported_combination ) usage(usage_string) sys.exit(2) elif o in ("-o", "--original"): main_freq = a ################################################################################ # MAIN SCRIPT longopts = ["combination=", "original="] args = read_options("c:o:", longopts, treat_options, -1, usage_string) filetype.parse(args, FreqCombinerHandler())
for (o, a) in opts: if o in ("-f", "--feat"): feat_list = treat_feat_list(a) elif o in ("-a", "--asc"): ascending = True a_or_d.append("a") elif o in ("-d", "--desc"): ascending = False a_or_d.append("d") elif o in ("-p", "--precs"): print_precs = True elif o == "--from": input_filetype_ext = a else: raise Exception("Bad arg: " + o) if len(a_or_d) > 1: warn("you should provide only one option, -a OR -d. Only the last one"+\ " will be considered.") if not feat_list: error("You MUST provide at least one feature with -f") ################################################################################ # MAIN SCRIPT longopts = ["from=", "feat=", "asc", "desc", "precs"] args = read_options("f:adp", longopts, treat_options, 1, usage_string) filetype.parse(args, StatsCollectorHandler(), input_filetype_ext) print_stats()
@param opts The options parsed by getopts. Ignored. @param arg The argument list parsed by getopts. @param n_arg The number of arguments expected for this script. """ global surface_instead_lemmas global lemmapos global input_filetype_ext treat_options_simplest( opts, arg, n_arg, usage_string ) for ( o, a ) in opts: if o in ("-s", "--surface") : surface_instead_lemmas = True elif o in ("-p", "--lemmapos") : lemmapos = True elif o == "--from": input_filetype_ext = a else: raise Exception("Bad arg: " + o) ################################################################################ # MAIN SCRIPT longopts = [ "surface", "lemmapos", "from=" ] args = read_options( "sp", longopts, treat_options, -1, usage_string ) handler = ft_csv.CSVPrinter("candidates", lemmapos=lemmapos, surfaces=surface_instead_lemmas) filetype.parse(args, handler, input_filetype_ext)
global limit global entity_buffer global input_filetype_ext global output_filetype_ext treat_options_simplest(opts, arg, n_arg, usage_string) for ( o, a ) in opts: if o == "--from": input_filetype_ext = a elif o == "--to": output_filetype_ext = a elif o in ("-n", "--number"): try: limit = int(a) entity_buffer = [None] * limit if limit < 0: raise ValueError except ValueError: error("You must provide a positive " + \ "integer value as argument of -n option.") else: raise Exception("Bad arg: " + o) ################################################################################ # MAIN SCRIPT args = read_options("n:", ["from=", "to=", "number="], treat_options, -1, usage_string) filetype.parse(args, TailPrinterHandler(limit), input_filetype_ext)
"""Callback function that handles the command line options of this script. @param opts The options parsed by getopts. Ignored. @param arg The argument list parsed by getopts. @param n_arg The number of arguments expected for this script. @param usage_string Instructions that appear if you run the program with the wrong parameters or options. """ global sent_split global output_filetype_ext treat_options_simplest(opts, arg, n_arg, usage_string) for ( o, a ) in opts: if o in ("-s", "--sentence"): sent_split = a elif o == "--to": output_filetype_ext = a else: raise Exception("Bad arg: " + o) ################################################################################ # MAIN SCRIPT longopts = ["sentence=", "to="] args = read_options("s:", longopts, treat_options, -1, usage_string) handler = filetype.AutomaticPrinterHandler(output_filetype_ext) parser = ft_treetagger.TreeTaggerParser("utf-8", sent_split) filetype.parse(args, handler, parser=parser)
ignore_pos = True elif o in ("-c", "--case"): ignore_case = False elif o in ("-L", "--lemma-or-surface"): lemma_or_surface = True elif o == "--input-from": input_filetype_ext = a elif o == "--reference-from": reference_filetype_ext = a else: raise Exception("Bad arg: " + o) # The reference list needs to be opened after all the options are read, # since options such as -g and -c modify the way the list is represented if ref_name : filetype.parse([ref_name], ReferenceReaderHandler(), reference_filetype_ext) gs_name = re.sub( ".*/", "", re.sub( "\.xml", "", ref_name ) ) # There's no reference list... Oh oh cannot evaluate :-( if not pre_gs : error("You MUST provide a non-empty reference list!") ################################################################################ # MAIN SCRIPT longopts = ["input-from=", "reference-from=", "reference=", "ignore-pos", "case", "lemma-or-surface"] args = read_options( "r:gcL", longopts, treat_options, -1, usage_string ) filetype.parse(args, EvaluatorHandler(), input_filetype_ext)
@param n_arg The number of arguments expected for this script. """ global web_freq treat_options_simplest(opts, arg, n_arg, usage_string) mode = [] for ( o, a ) in opts: if o in ( "-y", "--yahoo" ): web_freq = YahooFreq() mode.append("yahoo") elif o in ( "-w", "--google" ): web_freq = GoogleFreq() mode.append("google") if len(mode) > 1: error("At most one option -y or -w, should be provided") ################################################################################ # MAIN SCRIPT longopts = ["google", "yahoo"] args = read_options("wy", longopts, treat_options, -1, usage_string) try: filetype.parse(args, LemmatiserHandler()) finally: if web_freq: web_freq.flush_cache()
self.chain = self.make_printer(info, output_filetype_ext) self.chain.before_file(fileobj, info) def treat_options(opts, arg, n_arg, usage_string): """Callback function that handles the command line options of this script. @param opts The options parsed by getopts. Ignored. @param arg The argument list parsed by getopts. @param n_arg The number of arguments expected for this script. """ global input_filetype_ext global output_filetype_ext treat_options_simplest(opts, arg, n_arg, usage_string) for (o, a) in opts: if o in ("--from"): input_filetype_ext = a elif o in ("--to"): output_filetype_ext = a else: raise Exception("Bad arg: " + o) ################################################################################ # MAIN SCRIPT longopts = ["from=", "to="] args = read_options("", longopts, treat_options, -1, usage_string) filetype.parse(args, ConverterHandler(), input_filetype_ext)
@param n_arg The number of arguments expected for this script. """ global web_freq treat_options_simplest(opts, arg, n_arg, usage_string) mode = [] for (o, a) in opts: if o in ("-y", "--yahoo"): web_freq = YahooFreq() mode.append("yahoo") elif o in ("-w", "--google"): web_freq = GoogleFreq() mode.append("google") if len(mode) > 1: error("At most one option -y or -w, should be provided") ################################################################################ # MAIN SCRIPT longopts = ["google", "yahoo"] args = read_options("wy", longopts, treat_options, -1, usage_string) try: filetype.parse(args, LemmatiserHandler()) finally: if web_freq: web_freq.flush_cache()
else: # Web search, entries are single surface or lemma forms if surface_flag: build_entry = lambda surface, lemma, pos: surface else: build_entry = lambda surface, lemma, pos: lemma if len(mode) != 1: error("Exactly one option -u, -w or -i, must be provided") #elif text_input and web_freq is None: # warn("-x option is recommended for web queries, not textual indices") ################################################################################ # MAIN SCRIPT longopts = [ "candidates-from=", "corpus-from=", "to=", "yahoo", "google", "index=", "ignore-pos", "surface", "old", "lower=", "upper=", "vars", "lang=", "no-joint", "bigrams", "univ=", "web1t=" ] args = read_options("ywi:gsoal:Jbu:T:", longopts, treat_options, -1, usage_string) try: verbose("Counting ngrams in candidates file") filetype.parse(args, CounterPrinter(), filetype_candidates_ext) finally: if web_freq: web_freq.flush_cache() # VERY IMPORTANT!
################################################################################


def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    @param opts The (option, value) pairs parsed by getopts.

    @param arg The argument list parsed by getopts.

    @param n_arg The number of arguments expected for this script.

    @param usage_string The usage text, forwarded to
    `treat_options_simplest`.
    """
    global input_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        # Compare with ==: `o in ("--from")` tests against a plain string
        # (the parentheses do not make a tuple), i.e. a substring match.
        if o == "--from":
            input_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)


################################################################################
# MAIN SCRIPT

longopts = ["from="]
args = read_options("", longopts, treat_options, -1, usage_string)
# The ARFF relation is named after the first input file, or "stdin" when
# no file name was given.
relation_name = "stdin" if len(args) == 0 else args[0].replace(".xml", "")
filetype.parse(args,
               filetype.printer_class("ARFF")("corpus",
                                              relation_name=relation_name),
               input_filetype_ext)
elif o in ("-p", "--patterns"): input_patterns = filetype.parse_entities([a]) elif o in ("-d", "--match-distance") : match_distance = a elif o in ("-N", "--non-overlapping") : non_overlapping = True elif o == "--id-order": id_order = a.split(":") elif o == "--annotate": annotate = True elif o == "--only-matching": only_the_matching_subpart = True else: raise Exception("Bad arg " + o) if input_patterns is None: util.error("No patterns provided. Option --patterns is mandatory!") if only_the_matching_subpart and annotate: util.warn("Switch --only-matching disables --annotate") ################################################################################ # MAIN SCRIPT longopts = ["input-from=", "to=", "patterns=", "match-distance=", "non-overlapping=", "id-order=", "annotate", "only-matching"] args = util.read_options("p:d:N", longopts, treat_options, -1, usage_string) filetype.parse(args, GrepHandler(), input_filetype_ext)
@param n_arg The number of arguments expected for this script. """ global executable_w global executable_beg global executable_end global input_filetype_ext global output_filetype_ext util.treat_options_simplest(opts, arg, n_arg, usage_string) for (o, a) in opts: if o == "--from": input_filetype_ext = a elif o == "--to": output_filetype_ext = a elif o == "--begin": executable_beg = compile(a, "<cmdline:--begin>", "exec") elif o == "--end": executable_end = compile(a, "<cmdline:--end>", "exec") elif o in ("-w", "--each-word"): executable_w = compile(a, "<cmdline:--each-word>", "exec") else: raise Exception("Bad arg " + o) ################################################################################ # MAIN SCRIPT longopts = ["from=", "to=", "begin=", "end=", "each-word="] args = util.read_options("w:", longopts, treat_options, -1, usage_string) filetype.parse(args, TransformHandler(), input_filetype_ext)
@param n_arg The number of arguments expected for this script. """ global ignore_pos global surface_instead_lemmas global input_filetype_ext global output_filetype_ext treat_options_simplest( opts, arg, n_arg, usage_string ) for ( o, a ) in opts: if o == "--from": input_filetype_ext = a elif o == "--to": output_filetype_ext = a elif o in ("-g", "--ignore-pos") : ignore_pos = True elif o in ("-s", "--surface") : surface_instead_lemmas = True else: raise Exception("Bad arg: " + o) ################################################################################ # MAIN SCRIPT longopts = [ "from=", "to=", "ignore-pos", "surface" ] args = read_options( "gst", longopts, treat_options, -1, usage_string ) filetype.parse(args, UniqerHandler(), input_filetype_ext)
treat_options_simplest( opts, arg, n_arg, usage_string ) for ( o, a ) in opts: if o == "--from": input_filetype_ext = a elif o == "--to": output_filetype_ext = a elif o in ("-l","--lemmas" ) : lower_attr = "lemma" elif o in ("-a", "--algorithm"): algoname = a.lower() elif o in ("-m", "-x"): error( "Deprecated options -x and -m. Run with -h for details" ) else: raise Exception("Bad arg: " + o) ################################################################################ # MAIN SCRIPT longopts = [ "from=", "to=", "algorithm=", "lemmas" ] args = read_options( "a:xml", longopts, treat_options, 1, usage_string ) if algoname != "simple" : verbose( "Pass 1: Reading vocabulary from file... please wait" ) filetype.parse(args, VocabReaderHandler(), input_filetype_ext) verbose( "Pass 2: Lowercasing the words in the file" ) filetype.parse(args, LowercaserHandler(), input_filetype_ext)
@param n_arg The number of arguments expected for this script. """ global limit global input_filetype_ext global output_filetype_ext treat_options_simplest(opts, arg, n_arg, usage_string) for (o, a) in opts: if o == "--from": input_filetype_ext = a elif o == "--to": output_filetype_ext = a elif o in ("-n", "--number"): try: limit = int( a ) if limit < 0: raise ValueError except ValueError: error("You must provide a positive " \ "integer value as argument of -n option.") else: raise Exception("Bad arg") ################################################################################ # MAIN SCRIPT longopts = ["from=", "to=", "number="] args = read_options("n:", longopts, treat_options, -1, usage_string) filetype.parse(args, HeadPrinterHandler(limit), input_filetype_ext)
if algoname == "simple" : # Redundant, kept for clarity sent_handler = LowercaserHandler.handle_sentence_simple elif algoname == "complex" : sent_handler = LowercaserHandler.handle_sentence_complex elif algoname == "aggressive" : # Redundant, kept for clarity sent_handler = LowercaserHandler.handle_sentence_aggressive else : ctxinfo.error("Bad algorithm name `{name}`", name=algoname) elif o == "-m": ctxinfo.error("Deprecated option. Use --from=Moses instead" ) elif o == "-x": ctxinfo.error("Deprecated option. " \ "Use --from=PlainCorpus instead") else: raise Exception("Bad arg: " + o) ################################################################################ # MAIN SCRIPT longopts = [ "from=", "to=", "algorithm=", "lemmas" ] args = util.read_options( "a:xml", longopts, treat_options, 1, usage_string ) if sent_handler != LowercaserHandler.handle_sentence_simple : util.verbose( "Pass 1: Reading vocabulary from file... please wait" ) filetype.parse(args, VocabReaderHandler(), input_filetype_ext) util.verbose( "Pass 2: Lowercasing the words in the file" ) filetype.parse(args, LowercaserHandler(), input_filetype_ext)
a_or_d = [] for (o, a) in opts: if o in ("-f", "--feat"): feat_list = treat_feat_list(a) elif o in ("-a", "--asc"): ascending = True a_or_d.append("a") elif o in ("-d", "--desc"): ascending = False a_or_d.append("d") elif o in ("-p", "--precs"): print_precs = True elif o == "--from": input_filetype_ext = a else: raise Exception("Bad arg: " + o) if len(a_or_d) > 1: warn("you should provide only one option, -a OR -d. Only the last one" + " will be considered.") if not feat_list: error("You MUST provide at least one feature with -f") ################################################################################ # MAIN SCRIPT longopts = ["from=", "feat=", "asc", "desc", "precs"] args = read_options("f:adp", longopts, treat_options, 1, usage_string) filetype.parse(args, StatsCollectorHandler(), input_filetype_ext) print_stats()
self.add("\n") self.add(handled_type, ":\n") self.handled_type = handled_type self.counter = 0 ########################################################### def treat_options( opts, arg, n_arg, usage_string ) : """Callback function that handles the command line options of this script. @param opts The options parsed by getopts. Ignored. @param arg The argument list parsed by getopts. @param n_arg The number of arguments expected for this script. """ global reference_fname global mwe_evaluator treat_options_simplest(opts, arg, n_arg, usage_string) ################################################################################ # MAIN SCRIPT if __name__ == "__main__": longopts = [] args = read_options("", longopts, treat_options, -1, usage_string) parse(args, PrettyPrinterHandler())
@param arg The argument list parsed by getopts. @param n_arg The number of arguments expected for this script. """ global surface_instead_lemmas global lemmapos global input_filetype_ext treat_options_simplest( opts, arg, n_arg, usage_string ) for ( o, a ) in opts: if o in ("-s", "--surface") : surface_instead_lemmas = True elif o in ("-p", "--lemmapos") : lemmapos = True elif o in ("-f", "--freq-source") : freq_source = a elif o == "--from": input_filetype_ext = a else: raise Exception("Bad arg: " + o) ################################################################################ # MAIN SCRIPT longopts = [ "surface", "lemmapos", "freq-source=" "from=" ] args = read_options( "spf:", longopts, treat_options, -1, usage_string ) handler = ft_ucs.UCSPrinter("candidates", freq_source=freq_source, lemmapos=lemmapos, surfaces=surface_instead_lemmas) filetype.parse(args, handler, input_filetype_ext)
@param n_arg The number of arguments expected for this script. """ global ignore_pos global surface_instead_lemmas global input_filetype_ext global output_filetype_ext treat_options_simplest(opts, arg, n_arg, usage_string) for (o, a) in opts: if o == "--from": input_filetype_ext = a elif o == "--to": output_filetype_ext = a elif o in ("-g", "--ignore-pos"): ignore_pos = True elif o in ("-s", "--surface"): surface_instead_lemmas = True else: raise Exception("Bad arg: " + o) ################################################################################ # MAIN SCRIPT longopts = ["from=", "to=", "ignore-pos", "surface"] args = read_options("gst", longopts, treat_options, -1, usage_string) filetype.parse(args, UniqerHandler(), input_filetype_ext)
output_filetype_ext = a elif o == "--filter": action_annotate = False action_filter = True elif o == "--filter-and-annot": action_filter = True else: raise Exception("Bad arg: " + o) if not candidates_fnames: error("No candidates file given!") if detector_class == SourceDetector and n_gaps is not None: error('Bad arguments: method "Source" with "--gaps"') c = CandidatesHandler() verbose("Reading MWE list from candidates file") filetype.parse(candidates_fnames, c, filetype_candidates_ext) verbose("MWE list loaded in memory successfully") global detector detector = detector_class(c.info, n_gaps) ################################################################################ # MAIN SCRIPT longopts = [ "corpus-from=", "candidates-from=", "to=", "candidates=", "detector=", "gaps=", "source", "filter", "filter-and-annot" ] arg = read_options("c:d:g:So:", longopts, treat_options, -1, usage_string) filetype.parse(arg, AnnotatorHandler(), filetype_corpus_ext)
@param arg The argument list parsed by getopts. @param n_arg The number of arguments expected for this script. @param usage_string The usage string for the current script. """ global attributes treat_options_simplest( opts, arg, n_arg, usage_string ) for (o, a) in opts: if o in ("-a", "--attributes"): attributes = a.split(":") for attr in attributes: if attr not in WORD_ATTRIBUTES: error("Unknown attribute '%s'!" % attr) if attributes is None: print >>sys.stderr, "The option -a <attributes> is mandatory." usage(usage_string) sys.exit(2) ################################################################################ # MAIN SCRIPT longopts = ["atttibutes="] arg = read_options("a:", longopts, treat_options, -1, usage_string) filetype.parse(arg, TxtGeneratorHandler())
################################################################################


def treat_options(opts, arg, n_arg, usage_string):
    """Option callback: reads -n/--number into the global `limit`.

    @param opts The (option, value) pairs parsed by getopts.

    @param arg The argument list parsed by getopts.

    @param n_arg The number of arguments expected for this script.

    @param usage_string The usage text, forwarded to
    `treat_options_simplest`.
    """
    global limit

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for option, value in opts:
        # Only the number option is handled; anything else is left alone.
        if option not in ("-n", "--number"):
            continue
        try:
            limit = int(value)
            if limit < 0:
                raise ValueError
        except ValueError:
            error("You must provide a positive integer value as argument of "
                  "-n option.")


################################################################################
# MAIN SCRIPT

longopts = ["number="]
args = read_options("n:", longopts, treat_options, -1, usage_string)
filetype.parse(args, HistogramGeneratorHandler())
verb_table[ "google" ].sort( key=operator.itemgetter(3), reverse=True ) verb_table["google"] = verb_table["google"][ 0:5 ] compl_table[ "google" ].sort( key=operator.itemgetter(1), reverse=True ) compl_table["google"] = compl_table["google"][ 0:5 ] ent = entropy( probs_from_varfreqs( map( operator.itemgetter(0), freq_table["google"] ) ) ) ent_w = entropy( probs_weighted( map( operator.itemgetter(0), freq_table["google"] ), map( operator.itemgetter(1,2,3), freq_table["google"] ) ) ) ent_w_verb = entropy( probs_weighted( map( operator.itemgetter(0), compl_table["google"] ), map( operator.itemgetter(1,2,3), compl_table["google"] ) ) ) ent_w_compl = entropy( probs_weighted( map( operator.itemgetter(0), verb_table["google"] ), map( operator.itemgetter(1,2,3), verb_table["google"] ) ) ) candidate.add_feat( Feature( "entropy_google", str( ent ) ) ) candidate.add_feat( Feature( "entropy_w_google", str( ent_w ) ) ) candidate.add_feat( Feature( "entropy_w_verb_google", str( ent_w_verb ) ) ) candidate.add_feat( Feature( "entropy_w__compl_google", str( ent_w_compl ))) self.chain.handle_candidate(candidate, info) ################################################################################ # MAIN SCRIPT longopts = [] args = read_options( "", longopts, treat_options_simplest, -1, usage_string ) filetype.parse(args, FeatGeneratorHandler())
if o in ("--from"): input_filetype_ext = a elif o in ("--to"): output_filetype_ext = a elif o == "--keep-empty-words": keep_empty_words = True elif o == "--word-lemmas": take_lemma = True elif o == "--word-lemmas-matching": regex_word_lemma = a elif o == "--word-surfaces-matching": regex_word_surface = a elif o == "--word-pos-matching": regex_word_pos = a elif o == "--word-syn-matching": regex_word_syn = a else: raise Exception("Bad arg") ################################################################################ # MAIN SCRIPT longopts = ["from=", "to=", "keep-empty-words", "word-lemmas", "word-lemmas-matching=", "word-surfaces-matching=", "word-pos-matching=", "word-syn-matching="] args = read_options("", longopts, treat_options, -1, usage_string) printer = SelectorPrinterHandler() filetype.parse(args, printer, input_filetype_ext)
def treat_options(opts, arg, n_arg, usage_string):
    """Option callback: stores the sentence option and the output file type.

    @param opts The (option, value) pairs parsed by getopts.

    @param arg The argument list parsed by getopts.

    @param n_arg The number of arguments expected for this script.

    @param usage_string Instructions that appear if you run the program with
    the wrong parameters or options.
    """
    global sent_split, output_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for option, value in opts:
        if option == "--to":
            output_filetype_ext = value
            continue
        # Anything other than the sentence option is rejected.
        if option not in ("-s", "--sentence"):
            raise Exception("Bad arg: " + option)
        sent_split = value


################################################################################
# MAIN SCRIPT

longopts = ["sentence=", "to="]
args = read_options("s:", longopts, treat_options, -1, usage_string)
# The handler and parser are built only after read_options has run
# treat_options, so they pick up the values set from the command line.
handler = filetype.AutomaticPrinterHandler(output_filetype_ext)
parser = ft_treetagger.TreeTaggerParser("utf-8", sent_split)
filetype.parse(args, handler, parser=parser)
################################################################################


def treat_options(opts, arg, n_arg, usage_string):
    """Option callback: reads -n/--number into the global `limit`.

    @param opts The (option, value) pairs parsed by getopts.

    @param arg The argument list parsed by getopts.

    @param n_arg The number of arguments expected for this script.

    @param usage_string The usage text, forwarded to
    `treat_options_simplest`.
    """
    global limit

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for option, value in opts:
        # Only the number option is handled; anything else is left alone.
        if option not in ("-n", "--number"):
            continue
        try:
            limit = int(value)
            if limit < 0:
                raise ValueError
        except ValueError:
            error("You must provide a positive integer value as argument of "
                  "-n option.")


################################################################################
# MAIN SCRIPT

longopts = ["number="]
args = read_options("n:", longopts, treat_options, -1, usage_string)
filetype.parse(args, HistogramGeneratorHandler())
@param n_arg The number of arguments expected for this script. """ global combination global supported_combination global main_freq treat_options_simplest( opts, arg, n_arg, usage_string ) for ( o, a ) in opts: if o in ( "-c", "--combination" ) : try : combination = [] combination = interpret_combinations( a ) except ValueError as message : print >> sys.stderr, message print >> sys.stderr, "ERROR: argument must be list separated"+ \ "by \":\" and containing the names: "+\ str( supported_combination ) usage( usage_string ) sys.exit( 2 ) elif o in ( "-o", "--original" ) : main_freq = a ################################################################################ # MAIN SCRIPT longopts = [ "combination=", "original=" ] args = read_options( "c:o:", longopts, treat_options, -1, usage_string ) filetype.parse(args, FreqCombinerHandler())
owl_cand.append(form) owl_cand = "_".join(owl_cand) + "\"/>\n" self.add_string(owl_cand) def treat_options( opts, arg, n_arg, usage_string ) : """Callback function that handles the command line options of this script. @param opts The options parsed by getopts. Ignored. @param arg The argument list parsed by getopts. @param n_arg The number of arguments expected for this script. """ global surface_instead_lemmas treat_options_simplest( opts, arg, n_arg, usage_string ) mode = [] for ( o, a ) in opts: if o in ("-s", "--surface") : surface_instead_lemmas = True else: raise Exception("Bad arg: " + o) ################################################################################ # MAIN SCRIPT longopts = [ "surface" ] args = read_options( "s", longopts, treat_options, -1, usage_string ) filetype.parse(args, OwlPrinter("candidates"))
for ( o, a ) in opts: if o in ( "-m", "--measures" ) : try : measures = [] measures = interpret_measures( a ) except ValueError as message : error( str(message)+"\nargument must be list separated by " "\":\" and containing the names: "+ str( supported_measures )) elif o in ( "-o", "--original" ) : main_freq_name = a elif o in ( "-a", "--all" ) : join_all_contrastive = True if not main_freq_name : error( "Option -o is mandatory") ################################################################################ # MAIN SCRIPT longopts = ["measures=", "original=", "all"] args = read_options( "m:o:a", longopts, treat_options, 1, usage_string ) for a in args : verbose( "Pass 1 for " + a ) filetype.parse([a], TotalCalculatorHandler()) # First calculate Nc for each contrastive corpus verbose( "Pass 2 for " + a ) filetype.parse([a], MeasureCalculatorHandler())
suffix_array = index.load("lemma+pos") else: # Web search, entries are single surface or lemma forms if surface_flag: build_entry = lambda surface, lemma, pos: surface else: build_entry = lambda surface, lemma, pos: lemma if len(mode) != 1: error("Exactly one option -u, -w or -i, must be provided") #elif text_input and web_freq is None: # warn("-x option is recommended for web queries, not textual indices") ################################################################################ # MAIN SCRIPT longopts = ["candidates-from=", "corpus-from=", "to=", "yahoo", "google", "index=", "ignore-pos", "surface", "old", "lower=", "upper=", "vars", "lang=", "no-joint", "bigrams", "univ=", "web1t="] args = read_options("ywi:gsoal:Jbu:T:", longopts, treat_options, -1, usage_string) try: verbose("Counting ngrams in candidates file") filetype.parse(args, CounterPrinter(), filetype_candidates_ext) finally: if web_freq: web_freq.flush_cache() # VERY IMPORTANT!
""" global executable_w global executable_beg global executable_end global input_filetype_ext global output_filetype_ext util.treat_options_simplest(opts, arg, n_arg, usage_string) for (o, a) in opts: if o == "--from": input_filetype_ext = a elif o == "--to": output_filetype_ext = a elif o == "--begin": executable_beg = compile(a, "<cmdline:--begin>", "exec") elif o == "--end": executable_end = compile(a, "<cmdline:--end>", "exec") elif o in ("-w", "--each-word"): executable_w = compile(a, "<cmdline:--each-word>", "exec") else: raise Exception("Bad arg " + o) ################################################################################ # MAIN SCRIPT longopts = ["from=", "to=", "begin=", "end=", "each-word="] args = util.read_options("w:", longopts, treat_options, -1, usage_string) filetype.parse(args, TransformHandler(), input_filetype_ext)
elif o == "--id-order": id_order = a.split(":") elif o == "--from": input_filetype_ext = a elif o == "--to": output_filetype_ext = a else: raise Exception("Bad flag") if non_overlapping and match_distance == "All": # If we are taking all matches, we need to be able to overlap... error( "Conflicting options: --match-distance=All and --non-overlapping") if len(mode) != 1: error("Exactly one option, -p or -n, must be provided") if "patterns" in mode: global patterns patterns = filetype.parse_entities([patterns_file]) ################################################################################ # MAIN SCRIPT longopts = [ "from=", "to=", "patterns=", "ngram=", "index", "match-distance=", "non-overlapping", "freq", "ignore-pos", "surface", "source", "id-order=" ] arg = read_options("p:n:id:NfgsS", longopts, treat_options, -1, usage_string) filetype.parse(arg, CandidatesGeneratorHandler(), input_filetype_ext)
INFO = EvitaInfo()


class EvitaPrinter(filetype.common.AbstractPrinter):
    """Printer for the "candidates" category.

    Emits one header line per candidate followed by one quoted line per
    occurrence.
    """
    filetype_info = INFO
    valid_categories = ["candidates"]

    def handle_candidate(self, candidate, info={}):
        """Print the candidate ID and POS pattern, then each occurrence on
        its own line.

        @param candidate The `Candidate` that is being read from the XML file.
        @param info Not used by this method.
        """
        pos_pattern = candidate.get_pos_pattern().replace(SEPARATOR, " ")
        header = "candid=%(id)s pos=\"%(pos)s\"\n" % \
                 {"id": candidate.id_number, "pos": pos_pattern}
        self.add_string(header)

        for occurrence in candidate.occurs:
            # Lemma and POS are blanked before the occurrence is serialised.
            occurrence.set_all(lemma="", pos="")
            text = occurrence.to_string()
            text = text.replace(SEPARATOR, "").replace(WORD_SEPARATOR, " ")
            quoted = "\"%(occur)s\"\n" % {"occur": text}
            self.add_string(quoted.encode('utf-8'))

        # NOTE(review): read as one separator line per candidate, i.e. placed
        # after the loop - confirm against the uncollapsed file.
        self.add_string("\n")


################################################################################
# MAIN SCRIPT

args = read_options("", [], treat_options_simplest, -1, usage_string)
filetype.parse(args, EvitaPrinter("candidates"))
if o in ("-m", "--measures"): try: measures = [] measures = interpret_measures(a) except ValueError as message: error( str(message) + "\nargument must be list separated by " "\":\" and containing the names: " + str(supported_measures)) elif o in ("-o", "--original"): main_freq_name = a elif o in ("-a", "--all"): join_all_contrastive = True if not main_freq_name: error("Option -o is mandatory") ################################################################################ # MAIN SCRIPT longopts = ["measures=", "original=", "all"] args = read_options("m:o:a", longopts, treat_options, 1, usage_string) for a in args: verbose("Pass 1 for " + a) filetype.parse([a], TotalCalculatorHandler()) # First calculate Nc for each contrastive corpus verbose("Pass 2 for " + a) filetype.parse([a], MeasureCalculatorHandler())
class EvitaPrinter(filetype.common.AbstractPrinter):
    """Printer for the "candidates" category.

    Writes a `candid=... pos="..."` header for each candidate and then one
    quoted line per occurrence.
    """
    filetype_info = INFO
    valid_categories = ["candidates"]

    def handle_candidate(self, candidate, info={}):
        """Print the candidate ID and POS pattern, then each occurrence on
        its own line.

        @param candidate The `Candidate` that is being read from the XML file.
        @param info Not used by this method.
        """
        spaced_pos = candidate.get_pos_pattern().replace(SEPARATOR, " ")
        fields = {"id": candidate.id_number, "pos": spaced_pos}
        self.add_string("candid=%(id)s pos=\"%(pos)s\"\n" % fields)

        for form in candidate.occurs:
            # Lemma and POS are blanked before the form is serialised.
            form.set_all(lemma="", pos="")
            surface = form.to_string().replace(SEPARATOR, "")
            surface = surface.replace(WORD_SEPARATOR, " ")
            entry = "\"%(occur)s\"\n" % {"occur": surface}
            self.add_string(entry.encode('utf-8'))

        # NOTE(review): read as one separator line per candidate, i.e. placed
        # after the loop - confirm against the uncollapsed file.
        self.add_string("\n")


################################################################################
# MAIN SCRIPT

args = read_options("", [], treat_options_simplest, -1, usage_string)
filetype.parse(args, EvitaPrinter("candidates"))
@param n_arg The number of arguments expected for this script. """ global simplify global input_filetype_ext global output_filetype_ext treat_options_simplest(opts, arg, n_arg, usage_string) simplify = simplify_ptb for (o, a) in opts: if o in ("-p", "--palavras"): simplify = simplify_palavras elif o in ("-G", "--genia"): simplify = simplify_genia elif o == "--from": input_filetype_ext = a elif o == "--to": output_filetype_ext = a else: raise Exception("Bad arg: " + o) ################################################################################ # MAIN SCRIPT longopts = ["from=", "to=", "palavras", "genia"] args = read_options("xF:pg", longopts, treat_options, -1, usage_string) filetype.parse(args, FilterHandler(), input_filetype_ext)
elif o == "--filter": action_annotate = False action_filter = True elif o == "--filter-and-annot": action_filter = True else: raise Exception("Bad arg: " + o) if not candidates_fnames: error("No candidates file given!") if detector_class == SourceDetector and n_gaps is not None: error('Bad arguments: method "Source" with "--gaps"') c = CandidatesHandler() verbose("Reading MWE list from candidates file") filetype.parse(candidates_fnames, c, filetype_candidates_ext) verbose("MWE list loaded in memory successfully") global detector detector = detector_class(c.info, n_gaps) ################################################################################ # MAIN SCRIPT longopts = ["corpus-from=", "candidates-from=", "to=", "candidates=", "detector=", "gaps=", "source", "filter", "filter-and-annot"] arg = read_options("c:d:g:So:", longopts, treat_options, -1, usage_string) filetype.parse(arg, AnnotatorHandler(), filetype_corpus_ext)
@param arg The argument list parsed by getopts. @param n_arg The number of arguments expected for this script. @param usage_string The usage string for the current script. """ global attributes treat_options_simplest(opts, arg, n_arg, usage_string) for (o, a) in opts: if o in ("-a", "--attributes"): attributes = a.split(":") for attr in attributes: if attr not in WORD_ATTRIBUTES: error("Unknown attribute '%s'!" % attr) if attributes is None: print >> sys.stderr, "The option -a <attributes> is mandatory." usage(usage_string) sys.exit(2) ################################################################################ # MAIN SCRIPT longopts = ["atttibutes="] arg = read_options("a:", longopts, treat_options, -1, usage_string) filetype.parse(arg, TxtGeneratorHandler())
elif o in ("-L", "--lemma-or-surface"): lemma_or_surface = True elif o == "--input-from": input_filetype_ext = a elif o == "--reference-from": reference_filetype_ext = a else: raise Exception("Bad arg: " + o) # The reference list needs to be opened after all the options are read, # since options such as -g and -c modify the way the list is represented if ref_name: filetype.parse([ref_name], ReferenceReaderHandler(), reference_filetype_ext) gs_name = re.sub(".*/", "", re.sub("\.xml", "", ref_name)) # There's no reference list... Oh oh cannot evaluate :-( if not pre_gs: error("You MUST provide a non-empty reference list!") ################################################################################ # MAIN SCRIPT longopts = [ "input-from=", "reference-from=", "reference=", "ignore-pos", "case", "lemma-or-surface" ] args = read_options("r:gcL", longopts, treat_options, -1, usage_string) filetype.parse(args, EvaluatorHandler(), input_filetype_ext)