print_cand_freq = True elif o in ("-i", "--index") : input_filetype_ext = "BinaryIndex" warn("Option -i is deprecated; use --from=BinaryIndex") elif o == "--id-order": id_order = a.split(":") elif o == "--from" : input_filetype_ext = a elif o == "--to" : output_filetype_ext = a else: raise Exception("Bad flag") if non_overlapping and match_distance == "All": # If we are taking all matches, we need to be able to overlap... error("Conflicting options: --match-distance=All and --non-overlapping") if len(mode) != 1 : error("Exactly one option, -p or -n, must be provided") if "patterns" in mode: global patterns patterns = filetype.parse_entities([patterns_file]) ################################################################################ # MAIN SCRIPT longopts = [ "from=", "to=", "patterns=", "ngram=", "index", "match-distance=", "non-overlapping", "freq", "ignore-pos", "surface", "source", "id-order=" ] arg = read_options( "p:n:id:NfgsS", longopts, treat_options, -1, usage_string ) filetype.parse(arg, CandidatesGeneratorHandler(), input_filetype_ext)
for (o, a) in opts: if o in ("-f", "--feat"): feat_list = treat_feat_list(a) elif o in ("-a", "--asc"): ascending = True a_or_d.append("a") elif o in ("-d", "--desc"): ascending = False a_or_d.append("d") elif o in ("-p", "--precs"): print_precs = True elif o == "--from": input_filetype_ext = a else: raise Exception("Bad arg: " + o) if len(a_or_d) > 1: warn("you should provide only one option, -a OR -d. Only the last one"+\ " will be considered.") if not feat_list: error("You MUST provide at least one feature with -f") ################################################################################ # MAIN SCRIPT longopts = ["from=", "feat=", "asc", "desc", "precs"] args = read_options("f:adp", longopts, treat_options, 1, usage_string) filetype.parse(args, StatsCollectorHandler(), input_filetype_ext) print_stats()
for k1 in cache1.keys() : cache_out[ k1 ] = cache1[ k1 ] # Update entries in cache_out if corresponding entry in cache_2 is newer for k2 in cache2.keys() : ( freq2, date2 ) = cache2[ k2 ] ( freq_out, date_out ) = cache_out.get( k2, ( -1, None ) ) if date_out is None : cache_out[ k2 ] = ( freq2, date2 ) elif date2 < date_out : cache_out[ k2 ] = ( freq2, date2 ) ################################################################################ # MAIN SCRIPT longopts = [] arg = read_options( "", longopts, treat_options_simplest, 3, usage_string ) verbose( "Opening files and checking consistency" ) cache1_desc = open( arg[ 0 ], "r" ) cache2_desc = open( arg[ 1 ], "r" ) cache_out_desc = open( arg[ 2 ], "w" ) cache1 = cPickle.load( cache1_desc ) cache2 = cPickle.load( cache2_desc ) cache_out = {} verbose( "Combining cache files..." ) combine_caches( cache1, cache2, cache_out ) verbose( "Writing new cache file..." ) cPickle.dump( cache_out, cache_out_desc ) verbose( "{c} had {n} entries".format(c=arg[ 0 ], n=len(cache1)) ) verbose( "{c} had {n} entries".format(c=arg[ 1 ], n=len(cache2)) ) verbose( "Result has {n} entries".format(n=len(cache_out)) )
global limit
global entity_buffer
global input_filetype_ext
global output_filetype_ext
treat_options_simplest(opts, arg, n_arg, usage_string)
for (o, a) in opts:
    if o == "--from":
        input_filetype_ext = a
    elif o == "--to":
        output_filetype_ext = a
    elif o in ("-n", "--number"):
        try:
            limit = int(a)
            # BUGFIX: validate before allocating. The original allocated
            # the buffer first, so a negative -n silently created an empty
            # buffer before the error path was reached.
            if limit < 0:
                raise ValueError
            entity_buffer = [None] * limit
        except ValueError:
            error("You must provide a positive " + \
                  "integer value as argument of -n option.")
    else:
        raise Exception("Bad arg: " + o)

################################################################################
# MAIN SCRIPT

args = read_options("n:", ["from=", "to=", "number="],
                    treat_options, -1, usage_string)
filetype.parse(args, TailPrinterHandler(limit), input_filetype_ext)
""" global combination global supported_combination global main_freq treat_options_simplest(opts, arg, n_arg, usage_string) for (o, a) in opts: if o in ("-c", "--combination"): try: combination = [] combination = interpret_combinations(a) except ValueError as message: print >> sys.stderr, message print >> sys.stderr, "ERROR: argument must be list separated"+ \ "by \":\" and containing the names: "+\ str( supported_combination ) usage(usage_string) sys.exit(2) elif o in ("-o", "--original"): main_freq = a ################################################################################ # MAIN SCRIPT longopts = ["combination=", "original="] args = read_options("c:o:", longopts, treat_options, -1, usage_string) filetype.parse(args, FreqCombinerHandler())
treat_options_simplest( opts, arg, n_arg, usage_string ) for ( o, a ) in opts: if o == "--from": input_filetype_ext = a elif o == "--to": output_filetype_ext = a elif o in ("-l","--lemmas" ) : lower_attr = "lemma" elif o in ("-a", "--algorithm"): algoname = a.lower() elif o in ("-m", "-x"): error( "Deprecated options -x and -m. Run with -h for details" ) else: raise Exception("Bad arg: " + o) ################################################################################ # MAIN SCRIPT longopts = [ "from=", "to=", "algorithm=", "lemmas" ] args = read_options( "a:xml", longopts, treat_options, 1, usage_string ) if algoname != "simple" : verbose( "Pass 1: Reading vocabulary from file... please wait" ) filetype.parse(args, VocabReaderHandler(), input_filetype_ext) verbose( "Pass 2: Lowercasing the words in the file" ) filetype.parse(args, LowercaserHandler(), input_filetype_ext)
if algoname == "simple" : # Redundant, kept for clarity sent_handler = LowercaserHandler.handle_sentence_simple elif algoname == "complex" : sent_handler = LowercaserHandler.handle_sentence_complex elif algoname == "aggressive" : # Redundant, kept for clarity sent_handler = LowercaserHandler.handle_sentence_aggressive else : ctxinfo.error("Bad algorithm name `{name}`", name=algoname) elif o == "-m": ctxinfo.error("Deprecated option. Use --from=Moses instead" ) elif o == "-x": ctxinfo.error("Deprecated option. " \ "Use --from=PlainCorpus instead") else: raise Exception("Bad arg: " + o) ################################################################################ # MAIN SCRIPT longopts = [ "from=", "to=", "algorithm=", "lemmas" ] args = util.read_options( "a:xml", longopts, treat_options, 1, usage_string ) if sent_handler != LowercaserHandler.handle_sentence_simple : util.verbose( "Pass 1: Reading vocabulary from file... please wait" ) filetype.parse(args, VocabReaderHandler(), input_filetype_ext) util.verbose( "Pass 2: Lowercasing the words in the file" ) filetype.parse(args, LowercaserHandler(), input_filetype_ext)
if o in ("-s", "--surface"): surface_instead_lemmas = True base_attr = 'surface' elif o in ("-f", "--freq"): min_frequency = int(a) elif o in ("-n", "--ngram"): (min_ngram, max_ngram) = interpret_ngram(a) elif o in ("-i", "--index"): corpus_from_index = True elif o in ("-G", "--glue"): if a == "scp": glue = scp_glue else: error("Unknown glue function '%s'" % a) ################################################################################ corpus_from_index = False base_attr = 'lemma' glue = scp_glue min_ngram = 2 max_ngram = 8 min_frequency = 2 longopts = ["surface", "glue=", "ngram=", "freq=", "index"] arg = read_options("sG:n:f:i", longopts, treat_options, 1, usage_string) corpus_path = arg[0] main()
use_text_format = "moses" elif o in ("-c", "--conll"): use_text_format = "conll" elif o in ("-o", "--old"): indexlib.Index.use_c_indexer(False) if basename is None: error("You must provide a filename for the index.\n" "Option -i is mandatory.") ################################################################################ # MAIN SCRIPT longopts = ["from=", "index=", "attributes=", "old", "moses", "conll"] arg = read_options("i:a:omc", longopts, treat_options, -1, usage_string) simple_attrs = [a for a in used_attributes if '+' not in a] composite_attrs = [a for a in used_attributes if '+' in a] for attrs in [attr.split('+') for attr in composite_attrs]: for attr in attrs: if attr not in simple_attrs: simple_attrs.append(attr) index = indexlib.Index(basename, simple_attrs) indexlib.populate_index(index, arg, input_filetype_ext) for attr in composite_attrs: index.make_fused_array(attr.split('+')) #index.build_suffix_arrays() #index.save_main()
a_or_d = []
# Walk the parsed options; remember every -a/-d seen so we can warn about
# conflicting sort-direction flags afterwards.
for (opt, val) in opts:
    if opt in ("-f", "--feat"):
        feat_list = treat_feat_list(val)
    elif opt in ("-a", "--asc"):
        ascending = True
        a_or_d.append("a")
    elif opt in ("-d", "--desc"):
        ascending = False
        a_or_d.append("d")
    elif opt in ("-p", "--precs"):
        print_precs = True
    elif opt == "--from":
        input_filetype_ext = val
    else:
        raise Exception("Bad arg: " + opt)

# More than one direction flag: the last one wins (as set in the loop above).
if len(a_or_d) > 1:
    warn("you should provide only one option, -a OR -d. Only the last one"
         + " will be considered.")
# At least one feature is required for the statistics to be meaningful.
if not feat_list:
    error("You MUST provide at least one feature with -f")

################################################################################
# MAIN SCRIPT

longopts = ["from=", "feat=", "asc", "desc", "precs"]
args = read_options("f:adp", longopts, treat_options, 1, usage_string)
filetype.parse(args, StatsCollectorHandler(), input_filetype_ext)
print_stats()
unknown = a if o in ("-s", "--separator") : verbose( "Field separator: " + a ) separator = a if len( separator ) > 1 : warn("Multi-char field separator!") if o in ("-d", "--distance") : verbose("Calculating weighted coefficients using distance file") distances_matrix = read_distances( a ) if distances_matrix is None : warn("Error in distance matrix! Weighted coefficients will use 1.0 as default distance") if o in ("-c", "--confusion") : verbose( "Calculating confusion matrices" ) calculate_confusion = True ################################################################################ # MAIN SCRIPT longopts = [ "raters", "items", "pairwise", "separator=", "distance=", "confusion", "unknown=" ] arg = read_options( "rips:d:cu:", longopts, treat_options, -1, usage_string ) if len( arg ) == 0 : (annotations, Ni, Nc, Nk, categ_names) = read_data( sys.stdin ) calculate_and_print( annotations, Ni, Nc, Nk, categ_names ) else : for a in arg : input_file = open( a ) (annotations, Ni, Nc, Nk, categ_names) = read_data( input_file ) calculate_and_print( annotations, Ni, Nc, Nk, categ_names )
else: # Web search, entries are single surface or lemma forms if surface_flag: build_entry = lambda surface, lemma, pos: surface else: build_entry = lambda surface, lemma, pos: lemma if len(mode) != 1: error("Exactly one option -u, -w or -i, must be provided") #elif text_input and web_freq is None: # warn("-x option is recommended for web queries, not textual indices") ################################################################################ # MAIN SCRIPT longopts = [ "candidates-from=", "corpus-from=", "to=", "yahoo", "google", "index=", "ignore-pos", "surface", "old", "lower=", "upper=", "vars", "lang=", "no-joint", "bigrams", "univ=", "web1t=" ] args = read_options("ywi:gsoal:Jbu:T:", longopts, treat_options, -1, usage_string) try: verbose("Counting ngrams in candidates file") filetype.parse(args, CounterPrinter(), filetype_candidates_ext) finally: if web_freq: web_freq.flush_cache() # VERY IMPORTANT!
elif o in ("-p", "--patterns"): input_patterns = filetype.parse_entities([a]) elif o in ("-d", "--match-distance") : match_distance = a elif o in ("-N", "--non-overlapping") : non_overlapping = True elif o == "--id-order": id_order = a.split(":") elif o == "--annotate": annotate = True elif o == "--only-matching": only_the_matching_subpart = True else: raise Exception("Bad arg " + o) if input_patterns is None: util.error("No patterns provided. Option --patterns is mandatory!") if only_the_matching_subpart and annotate: util.warn("Switch --only-matching disables --annotate") ################################################################################ # MAIN SCRIPT longopts = ["input-from=", "to=", "patterns=", "match-distance=", "non-overlapping=", "id-order=", "annotate", "only-matching"] args = util.read_options("p:d:N", longopts, treat_options, -1, usage_string) filetype.parse(args, GrepHandler(), input_filetype_ext)
ignore_pos = True elif o in ("-c", "--case"): ignore_case = False elif o in ("-L", "--lemma-or-surface"): lemma_or_surface = True elif o == "--input-from": input_filetype_ext = a elif o == "--reference-from": reference_filetype_ext = a else: raise Exception("Bad arg: " + o) # The reference list needs to be opened after all the options are read, # since options such as -g and -c modify the way the list is represented if ref_name : filetype.parse([ref_name], ReferenceReaderHandler(), reference_filetype_ext) gs_name = re.sub( ".*/", "", re.sub( "\.xml", "", ref_name ) ) # There's no reference list... Oh oh cannot evaluate :-( if not pre_gs : error("You MUST provide a non-empty reference list!") ################################################################################ # MAIN SCRIPT longopts = ["input-from=", "reference-from=", "reference=", "ignore-pos", "case", "lemma-or-surface"] args = read_options( "r:gcL", longopts, treat_options, -1, usage_string ) filetype.parse(args, EvaluatorHandler(), input_filetype_ext)
""" global SEPCHAR global SURFACE_FLAG for (o, a) in opts: if o == "-F": # sets a new separator character to be used when spliting a line SEPCHAR = a elif o == "-s": # sets the assignment of a word to the "surface" item. # default is set to "lemma". SURFACE_FLAG = 1 else: error("Option " + o + " is not a valid option") ################################################################################ # MAIN SCRIPT if __name__ == '__main__': files = read_options("F:s", [], treat_options_csv2xml, 2, usage_string) for file in files: initialize(file) print(XML_HEADER % {"category": "candidates", "ns": ""}) getMeta(file) getCand(file) print(XML_FOOTER % {"category": "candidates"})
for ( o, a ) in opts: if o in ( "-m", "--measures" ) : try : measures = [] measures = interpret_measures( a ) except ValueError as message : error( str(message)+"\nargument must be list separated by " "\":\" and containing the names: "+ str( supported_measures )) elif o in ( "-o", "--original" ) : main_freq_name = a elif o in ( "-a", "--all" ) : join_all_contrastive = True if not main_freq_name : error( "Option -o is mandatory") ################################################################################ # MAIN SCRIPT longopts = ["measures=", "original=", "all"] args = read_options( "m:o:a", longopts, treat_options, 1, usage_string ) for a in args : verbose( "Pass 1 for " + a ) filetype.parse([a], TotalCalculatorHandler()) # First calculate Nc for each contrastive corpus verbose( "Pass 2 for " + a ) filetype.parse([a], MeasureCalculatorHandler())
@param n_arg The number of arguments expected for this script. """ global simplify global input_filetype_ext global output_filetype_ext treat_options_simplest(opts, arg, n_arg, usage_string) simplify = simplify_ptb for (o, a) in opts: if o in ("-p", "--palavras"): simplify = simplify_palavras elif o in ("-G", "--genia"): simplify = simplify_genia elif o == "--from": input_filetype_ext = a elif o == "--to": output_filetype_ext = a else: raise Exception("Bad arg: " + o) ################################################################################ # MAIN SCRIPT longopts = ["from=", "to=", "palavras", "genia"] args = read_options("xF:pg", longopts, treat_options, -1, usage_string) filetype.parse(args, FilterHandler(), input_filetype_ext)
warn("Multi-char field separator!") if o in ("-d", "--distance"): verbose("Calculating weighted coefficients using distance file") distances_matrix = read_distances(a) if distances_matrix is None: warn( "Error in distance matrix! Weighted coefficients will use 1.0 as default distance" ) if o in ("-c", "--confusion"): verbose("Calculating confusion matrices") calculate_confusion = True ################################################################################ # MAIN SCRIPT longopts = [ "raters", "items", "pairwise", "separator=", "distance=", "confusion", "unknown=" ] arg = read_options("rips:d:cu:", longopts, treat_options, -1, usage_string) if len(arg) == 0: (annotations, Ni, Nc, Nk, categ_names) = read_data(sys.stdin) calculate_and_print(annotations, Ni, Nc, Nk, categ_names) else: for a in arg: input_file = open(a) (annotations, Ni, Nc, Nk, categ_names) = read_data(input_file) calculate_and_print(annotations, Ni, Nc, Nk, categ_names)
output_filetype_ext = a elif o == "--filter": action_annotate = False action_filter = True elif o == "--filter-and-annot": action_filter = True else: raise Exception("Bad arg: " + o) if not candidates_fnames: error("No candidates file given!") if detector_class == SourceDetector and n_gaps is not None: error('Bad arguments: method "Source" with "--gaps"') c = CandidatesHandler() verbose("Reading MWE list from candidates file") filetype.parse(candidates_fnames, c, filetype_candidates_ext) verbose("MWE list loaded in memory successfully") global detector detector = detector_class(c.info, n_gaps) ################################################################################ # MAIN SCRIPT longopts = [ "corpus-from=", "candidates-from=", "to=", "candidates=", "detector=", "gaps=", "source", "filter", "filter-and-annot" ] arg = read_options("c:d:g:So:", longopts, treat_options, -1, usage_string) filetype.parse(arg, AnnotatorHandler(), filetype_corpus_ext)
use_text_format = "moses" elif o in ("-c", "--conll"): use_text_format = "conll" elif o in ("-o", "--old"): indexlib.Index.use_c_indexer(False) if basename is None: error("You must provide a filename for the index.\n" "Option -i is mandatory.") ################################################################################ # MAIN SCRIPT longopts = ["from=", "index=", "attributes=", "old", "moses", "conll" ] arg = read_options( "i:a:omc", longopts, treat_options, -1, usage_string ) simple_attrs = [a for a in used_attributes if '+' not in a] composite_attrs = [a for a in used_attributes if '+' in a] for attrs in [attr.split('+') for attr in composite_attrs]: for attr in attrs: if attr not in simple_attrs: simple_attrs.append(attr) index = indexlib.Index(basename, simple_attrs) indexlib.populate_index(index, arg, input_filetype_ext) for attr in composite_attrs: index.make_fused_array(attr.split('+')) #index.build_suffix_arrays()
elif o in ("-i", "--minlength"): minlength = interpret_length(a, "minimum") elif o in ("-a", "--maxlength"): maxlength = interpret_length(a, "maximum") elif o == "--min-mweoccurs": min_mweoccurs = interpret_length(a, "minimum") elif o == "--max-mweoccurs": max_mweoccurs = interpret_length(a, "maximum") elif o == "--from": input_filetype_ext = a elif o == "--to": output_filetype_ext = a else: raise Exception("Bad arg: " + o) if minlength > maxlength: warn("minlength should be <= maxlength") if min_mweoccurs > max_mweoccurs: warn("min-mweoccurs should be <= max-mweoccurs") ################################################################################ # MAIN SCRIPT longopts = [ "threshold=", "equals=", "patterns=", "reverse", "maxlength=", "minlength=", "min-mweoccurs=", "max-mweoccurs=", "from=", "to=" ] args = read_options("t:e:p:ra:i:", longopts, treat_options, -1, usage_string) filetype.parse(args, FilterHandler(), input_filetype_ext)
""" global executable_w global executable_beg global executable_end global input_filetype_ext global output_filetype_ext util.treat_options_simplest(opts, arg, n_arg, usage_string) for (o, a) in opts: if o == "--from": input_filetype_ext = a elif o == "--to": output_filetype_ext = a elif o == "--begin": executable_beg = compile(a, "<cmdline:--begin>", "exec") elif o == "--end": executable_end = compile(a, "<cmdline:--end>", "exec") elif o in ("-w", "--each-word"): executable_w = compile(a, "<cmdline:--each-word>", "exec") else: raise Exception("Bad arg " + o) ################################################################################ # MAIN SCRIPT longopts = ["from=", "to=", "begin=", "end=", "each-word="] args = util.read_options("w:", longopts, treat_options, -1, usage_string) filetype.parse(args, TransformHandler(), input_filetype_ext)
else: l_empty=0 first_line=True else: process_tree_branch(l,phrase) l=unicode( rasp.readline(), "utf-8" ) if l_empty != 1 and len(phrase) != 0 : #save last entry write_entry(n_line,map( lambda x: x[1], sorted( phrase.items() ) )) if morphg_folder : os.chdir( work_path ) ############################################################################### # MAIN SCRIPT longopts = ["morphg=", "moses"] arg = read_options( "m:x", longopts, treat_options, -1, usage_string ) if not generate_text : print( XML_HEADER % { "category": "corpus", "ns": "" } ) if len( arg ) == 0 : transform_format( sys.stdin ) else : for a in arg : try: input_file=open(a, 'r') except IOError as e: error( 'Error opening file for reading.' ) transform_format( input_file ) input_file.close()
@param n_arg The number of arguments expected for this script. """ global SEPCHAR global SURFACE_FLAG for ( o , a ) in opts: if o == "-F": # sets a new separator character to be used when spliting a line SEPCHAR = a elif o == "-s": # sets the assignment of a word to the "surface" item. # default is set to "lemma". SURFACE_FLAG = 1 else: error("Option " + o + " is not a valid option") ################################################################################ # MAIN SCRIPT if __name__ == '__main__': files = read_options( "F:s", [], treat_options_csv2xml, 2, usage_string ) for file in files: initialize(file) print(XML_HEADER % { "category":"candidates", "ns":"" }) getMeta(file) getCand(file) print(XML_FOOTER % { "category":"candidates" })
suffix_array = index.load("lemma+pos") else: # Web search, entries are single surface or lemma forms if surface_flag: build_entry = lambda surface, lemma, pos: surface else: build_entry = lambda surface, lemma, pos: lemma if len(mode) != 1: error("Exactly one option -u, -w or -i, must be provided") #elif text_input and web_freq is None: # warn("-x option is recommended for web queries, not textual indices") ################################################################################ # MAIN SCRIPT longopts = ["candidates-from=", "corpus-from=", "to=", "yahoo", "google", "index=", "ignore-pos", "surface", "old", "lower=", "upper=", "vars", "lang=", "no-joint", "bigrams", "univ=", "web1t="] args = read_options("ywi:gsoal:Jbu:T:", longopts, treat_options, -1, usage_string) try: verbose("Counting ngrams in candidates file") filetype.parse(args, CounterPrinter(), filetype_candidates_ext) finally: if web_freq: web_freq.flush_cache() # VERY IMPORTANT!
INFO = EvitaInfo()

class EvitaPrinter(filetype.common.AbstractPrinter):
    """Printer that renders candidates in Evita's plain-text format."""
    filetype_info = INFO
    valid_categories = ["candidates"]

    def handle_candidate(self, candidate, info=None):
        """For each `Candidate`, print the candidate ID, its POS pattern
        and the list of occurrences one per line
        @param candidate The `Candidate` that is being read from the XML file.
        @param info Extra parsing information (unused here).
        """
        # BUGFIX: `info` previously used a mutable default argument ({});
        # `None` avoids the shared-dict pitfall without changing behavior,
        # since `info` is never read in this method.
        pos = candidate.get_pos_pattern()
        pos = pos.replace(SEPARATOR, " ")
        self.add_string("candid=%(id)s pos=\"%(pos)s\"\n" % \
                        {"id": candidate.id_number, "pos": pos})
        for form in candidate.occurs:
            # Strip lemma/pos so only the surface occurrence is printed.
            form.set_all(lemma="", pos="")
            occur = form.to_string()
            occur = occur.replace(SEPARATOR, "")
            occur = occur.replace(WORD_SEPARATOR, " ")
            self.add_string(("\"%(occur)s\"\n" % {"occur": occur}).encode('utf-8'))
        self.add_string("\n")

################################################################################
# MAIN SCRIPT

args = read_options("", [], treat_options_simplest, -1, usage_string)
filetype.parse(args, EvitaPrinter("candidates"))
elif o == "--id-order": id_order = a.split(":") elif o == "--from": input_filetype_ext = a elif o == "--to": output_filetype_ext = a else: raise Exception("Bad flag") if non_overlapping and match_distance == "All": # If we are taking all matches, we need to be able to overlap... error( "Conflicting options: --match-distance=All and --non-overlapping") if len(mode) != 1: error("Exactly one option, -p or -n, must be provided") if "patterns" in mode: global patterns patterns = filetype.parse_entities([patterns_file]) ################################################################################ # MAIN SCRIPT longopts = [ "from=", "to=", "patterns=", "ngram=", "index", "match-distance=", "non-overlapping", "freq", "ignore-pos", "surface", "source", "id-order=" ] arg = read_options("p:n:id:NfgsS", longopts, treat_options, -1, usage_string) filetype.parse(arg, CandidatesGeneratorHandler(), input_filetype_ext)
elif o == "--filter": action_annotate = False action_filter = True elif o == "--filter-and-annot": action_filter = True else: raise Exception("Bad arg: " + o) if not candidates_fnames: error("No candidates file given!") if detector_class == SourceDetector and n_gaps is not None: error('Bad arguments: method "Source" with "--gaps"') c = CandidatesHandler() verbose("Reading MWE list from candidates file") filetype.parse(candidates_fnames, c, filetype_candidates_ext) verbose("MWE list loaded in memory successfully") global detector detector = detector_class(c.info, n_gaps) ################################################################################ # MAIN SCRIPT longopts = ["corpus-from=", "candidates-from=", "to=", "candidates=", "detector=", "gaps=", "source", "filter", "filter-and-annot"] arg = read_options("c:d:g:So:", longopts, treat_options, -1, usage_string) filetype.parse(arg, AnnotatorHandler(), filetype_corpus_ext)
# Start from a copy of every entry in cache1.
for key in cache1.keys():
    cache_out[key] = cache1[key]
# Merge cache2 entries into the result.
# NOTE(review): the original comment said "if corresponding entry in
# cache_2 is newer", but the condition below replaces an entry when
# cache2's date is *earlier* (date2 < date_out) -- confirm which
# direction is actually intended.
for key in cache2.keys():
    (freq2, date2) = cache2[key]
    (freq_out, date_out) = cache_out.get(key, (-1, None))
    if date_out is None or date2 < date_out:
        cache_out[key] = (freq2, date2)

################################################################################
# MAIN SCRIPT

longopts = []
arg = read_options("", longopts, treat_options_simplest, 3, usage_string)
verbose("Opening files and checking consistency")
cache1_desc = open(arg[0], "r")
cache2_desc = open(arg[1], "r")
cache_out_desc = open(arg[2], "w")
cache1 = cPickle.load(cache1_desc)
cache2 = cPickle.load(cache2_desc)
cache_out = {}
verbose("Combining cache files...")
combine_caches(cache1, cache2, cache_out)
verbose("Writing new cache file...")
cPickle.dump(cache_out, cache_out_desc)
verbose("{c} had {n} entries".format(c=arg[0], n=len(cache1)))
verbose("{c} had {n} entries".format(c=arg[1], n=len(cache2)))
verbose("Result has {n} entries".format(n=len(cache_out)))
@param arg The argument list parsed by getopts. @param n_arg The number of arguments expected for this script. """ global surface_instead_lemmas global lemmapos global input_filetype_ext treat_options_simplest( opts, arg, n_arg, usage_string ) for ( o, a ) in opts: if o in ("-s", "--surface") : surface_instead_lemmas = True elif o in ("-p", "--lemmapos") : lemmapos = True elif o in ("-f", "--freq-source") : freq_source = a elif o == "--from": input_filetype_ext = a else: raise Exception("Bad arg: " + o) ################################################################################ # MAIN SCRIPT longopts = [ "surface", "lemmapos", "freq-source=" "from=" ] args = read_options( "spf:", longopts, treat_options, -1, usage_string ) handler = ft_ucs.UCSPrinter("candidates", freq_source=freq_source, lemmapos=lemmapos, surfaces=surface_instead_lemmas) filetype.parse(args, handler, input_filetype_ext)
for ( o, a ) in opts: if o in ("-s", "--surface") : surface_instead_lemmas = True base_attr = 'surface' elif o in ("-f", "--freq") : min_frequency = int(a) elif o in ("-n", "--ngram") : (min_ngram, max_ngram) = interpret_ngram(a) elif o in ("-i", "--index") : corpus_from_index = True elif o in ("-G", "--glue"): if a == "scp": glue = scp_glue else: error("Unknown glue function '%s'" % a) ################################################################################ corpus_from_index = False base_attr = 'lemma' glue = scp_glue min_ngram = 2 max_ngram = 8 min_frequency = 2 longopts = ["surface", "glue=", "ngram=", "freq=", "index"] arg = read_options("sG:n:f:i", longopts, treat_options, 1, usage_string) corpus_path = arg[0] main()
verbose("Option REVERSE active") elif o in ("-i", "--minlength") : minlength = interpret_length( a, "minimum" ) elif o in ("-a", "--maxlength") : maxlength = interpret_length( a, "maximum" ) elif o == "--min-mweoccurs": min_mweoccurs = interpret_length(a, "minimum") elif o == "--max-mweoccurs": max_mweoccurs = interpret_length(a, "maximum") elif o == "--from": input_filetype_ext = a elif o == "--to": output_filetype_ext = a else: raise Exception("Bad arg: " + o) if minlength > maxlength: warn("minlength should be <= maxlength") if min_mweoccurs > max_mweoccurs: warn("min-mweoccurs should be <= max-mweoccurs") ################################################################################ # MAIN SCRIPT longopts = [ "threshold=", "equals=", "patterns=", "reverse", "maxlength=", "minlength=", "min-mweoccurs=", "max-mweoccurs=", "from=", "to=" ] args = read_options( "t:e:p:ra:i:", longopts, treat_options, -1, usage_string ) filetype.parse(args, FilterHandler(), input_filetype_ext)
if o in ("-m", "--measures"): try: measures = [] measures = interpret_measures(a) except ValueError as message: error( str(message) + "\nargument must be list separated by " "\":\" and containing the names: " + str(supported_measures)) elif o in ("-o", "--original"): main_freq_name = a elif o in ("-a", "--all"): join_all_contrastive = True if not main_freq_name: error("Option -o is mandatory") ################################################################################ # MAIN SCRIPT longopts = ["measures=", "original=", "all"] args = read_options("m:o:a", longopts, treat_options, 1, usage_string) for a in args: verbose("Pass 1 for " + a) filetype.parse([a], TotalCalculatorHandler()) # First calculate Nc for each contrastive corpus verbose("Pass 2 for " + a) filetype.parse([a], MeasureCalculatorHandler())
print("\t " + str(word.attrib)) currentMWE = candidateId + ";" + ngram.find('freq').get("value") + ";" print() for word in ngram.xpath('./w'): print(word.attrib) currentMWE = currentMWE + str(word.attrib) + ";" if currentMWE.count(";") == 4: currentMWE = currentMWE + ";" validation = raw_input("Is this a MWE: ") print("\t You entered: ", validation) if validation == ".": validatedMWEfile.close() exit() currentMWE = currentMWE + validation validatedMWEfile.write(currentMWE + "\n") totalValidated = totalValidated + 1 print("\t ---> Total validated:", str(totalValidated)) validatedMWEfile.close() ################################################################################ # MAIN SCRIPT args = read_options("", [], treat_options_simplest, 2, usage_string) candidates_filename = args[0] output_filename = args[1] annotate_candidates()
@param arg The argument list parsed by getopts. @param n_arg The number of arguments expected for this script. @param usage_string The usage string for the current script. """ global attributes treat_options_simplest(opts, arg, n_arg, usage_string) for (o, a) in opts: if o in ("-a", "--attributes"): attributes = a.split(":") for attr in attributes: if attr not in WORD_ATTRIBUTES: error("Unknown attribute '%s'!" % attr) if attributes is None: print >> sys.stderr, "The option -a <attributes> is mandatory." usage(usage_string) sys.exit(2) ################################################################################ # MAIN SCRIPT longopts = ["atttibutes="] arg = read_options("a:", longopts, treat_options, -1, usage_string) filetype.parse(arg, TxtGeneratorHandler())
elif o in ("-L", "--lemma-or-surface"): lemma_or_surface = True elif o == "--input-from": input_filetype_ext = a elif o == "--reference-from": reference_filetype_ext = a else: raise Exception("Bad arg: " + o) # The reference list needs to be opened after all the options are read, # since options such as -g and -c modify the way the list is represented if ref_name: filetype.parse([ref_name], ReferenceReaderHandler(), reference_filetype_ext) gs_name = re.sub(".*/", "", re.sub("\.xml", "", ref_name)) # There's no reference list... Oh oh cannot evaluate :-( if not pre_gs: error("You MUST provide a non-empty reference list!") ################################################################################ # MAIN SCRIPT longopts = [ "input-from=", "reference-from=", "reference=", "ignore-pos", "case", "lemma-or-surface" ] args = read_options("r:gcL", longopts, treat_options, -1, usage_string) filetype.parse(args, EvaluatorHandler(), input_filetype_ext)