Exemple #1
0
def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    Updates the module-level file type and action settings from the options,
    reads the candidates files and stores the resulting detector in the
    global `detector`.

    @param opts The (option, value) pairs parsed by getopts.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.
    @param usage_string The usage message, passed on to
    `treat_options_simplest`.
    """
    global filetype_corpus_ext
    global filetype_candidates_ext
    global output_filetype_ext
    global action_annotate
    global action_filter
    global detector

    treat_options_simplest(opts, arg, n_arg, usage_string)

    detector_class = ContiguousLemmaDetector
    candidates_fnames = []
    n_gaps = None

    for (o, a) in opts:
        if o in ("-c", "--candidates"):
            candidates_fnames.append(a)
        elif o in ("-d", "--detector"):
            detector_class = detectors.get(a)
            if detector_class is None:
                error("Unknown detector name: " + a)
        elif o in ("-S", "--source"):
            detector_class = SourceDetector
        elif o in ("-g", "--gaps"):
            # Report a malformed number through error() instead of letting
            # a bare ValueError traceback reach the user.
            try:
                n_gaps = int(a)
            except ValueError:
                error("Option --gaps expects an integer value, got: " + a)
        elif o == "--corpus-from":
            filetype_corpus_ext = a
        elif o == "--candidates-from":
            filetype_candidates_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o == "--filter":
            action_annotate = False
            action_filter = True
        elif o == "--filter-and-annot":
            action_filter = True
        else:
            raise Exception("Bad arg: " + o)

    if not candidates_fnames:
        error("No candidates file given!")
    # The "Source" method cannot be combined with an explicit gap count.
    if detector_class == SourceDetector and n_gaps is not None:
        error('Bad arguments: method "Source" with "--gaps"')
    c = CandidatesHandler()
    verbose("Reading MWE list from candidates file")
    filetype.parse(candidates_fnames, c, filetype_candidates_ext)
    verbose("MWE list loaded in memory successfully")
    detector = detector_class(c.info, n_gaps)
Exemple #2
0
def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    Updates the module-level file type and action settings from the options,
    reads the candidates files and stores the resulting detector in the
    global `detector`.

    @param opts The (option, value) pairs parsed by getopts.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.
    @param usage_string The usage message, passed on to
    `treat_options_simplest`.
    """
    global filetype_corpus_ext
    global filetype_candidates_ext
    global output_filetype_ext
    global action_annotate
    global action_filter
    global detector

    treat_options_simplest(opts, arg, n_arg, usage_string)

    detector_class = ContiguousLemmaDetector
    candidates_fnames = []
    n_gaps = None

    for (o, a) in opts:
        if o in ("-c", "--candidates"):
            candidates_fnames.append(a)
        elif o in ("-d", "--detector"):
            detector_class = detectors.get(a)
            if detector_class is None:
                error("Unknown detector name: " + a)
        elif o in ("-S", "--source"):
            detector_class = SourceDetector
        elif o in ("-g", "--gaps"):
            # A non-numeric value is a usage error, not a crash: route it
            # through error() rather than an uncaught ValueError.
            try:
                n_gaps = int(a)
            except ValueError:
                error("Option --gaps expects an integer value, got: " + a)
        elif o == "--corpus-from":
            filetype_corpus_ext = a
        elif o == "--candidates-from":
            filetype_candidates_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o == "--filter":
            action_annotate = False
            action_filter = True
        elif o == "--filter-and-annot":
            action_filter = True
        else:
            raise Exception("Bad arg: " + o)

    if not candidates_fnames:
        error("No candidates file given!")
    # The "Source" method cannot be combined with an explicit gap count.
    if detector_class == SourceDetector and n_gaps is not None:
        error('Bad arguments: method "Source" with "--gaps"')
    c = CandidatesHandler()
    verbose("Reading MWE list from candidates file")
    filetype.parse(candidates_fnames, c, filetype_candidates_ext)
    verbose("MWE list loaded in memory successfully")
    detector = detector_class(c.info, n_gaps)
Exemple #3
0
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.

        Sets the module-level comparison flags and file type settings, then
        reads the reference list (if one was given) and derives `gs_name`
        from its file name.

        @param opts The (option, value) pairs parsed by getopts.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.

        @param usage_string The usage message, passed on to
        `treat_options_simplest`.
    """
    global pre_gs
    global ignore_pos
    global gs_name
    global ignore_case
    global lemma_or_surface
    global input_filetype_ext
    global reference_filetype_ext
    ref_name = None

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-r", "--reference"):
            ref_name = a
        elif o in ("-g", "--ignore-pos"):
            ignore_pos = True
        elif o in ("-c", "--case"):
            ignore_case = False
        elif o in ("-L", "--lemma-or-surface"):
            lemma_or_surface = True
        elif o == "--input-from":
            input_filetype_ext = a
        elif o == "--reference-from":
            reference_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    # The reference list needs to be opened after all the options are read,
    # since options such as -g and -c modify the way the list is represented
    if ref_name:
        filetype.parse([ref_name], ReferenceReaderHandler(),
                       reference_filetype_ext)
        # Raw strings: "\." in a plain literal is an invalid escape sequence.
        # Strips ".xml" and then everything up to the last "/".
        gs_name = re.sub(r".*/", "", re.sub(r"\.xml", "", ref_name))
    # There's no reference list... Oh oh cannot evaluate :-(
    if not pre_gs:
        error("You MUST provide a non-empty reference list!")
Exemple #4
0
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.

        Sets the module-level comparison flags and file type settings, then
        reads the reference list (if one was given) and derives `gs_name`
        from its file name.

        @param opts The (option, value) pairs parsed by getopts.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.

        @param usage_string The usage message, passed on to
        `treat_options_simplest`.
    """
    global pre_gs
    global ignore_pos
    global gs_name
    global ignore_case
    global lemma_or_surface
    global input_filetype_ext
    global reference_filetype_ext
    ref_name = None

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-r", "--reference"):
            ref_name = a
        elif o in ("-g", "--ignore-pos"):
            ignore_pos = True
        elif o in ("-c", "--case"):
            ignore_case = False
        elif o in ("-L", "--lemma-or-surface"):
            lemma_or_surface = True
        elif o == "--input-from":
            input_filetype_ext = a
        elif o == "--reference-from":
            reference_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    # The reference list needs to be opened after all the options are read,
    # since options such as -g and -c modify the way the list is represented
    if ref_name:
        filetype.parse([ref_name], ReferenceReaderHandler(),
                       reference_filetype_ext)
        # Raw strings avoid the invalid "\." escape of the plain literal.
        # Removes ".xml" first, then the directory part of the path.
        gs_name = re.sub(r".*/", "", re.sub(r"\.xml", "", ref_name))
    # There's no reference list... Oh oh cannot evaluate :-(
    if not pre_gs:
        error("You MUST provide a non-empty reference list!")
Exemple #5
0
def main(corpus_paths):
    """
        Main function.

        Counts the ngrams of the given corpus files. When `use_shelve` is
        set, the counters are backed by temporary shelves that are removed
        again once counting is over, even if parsing fails.

        @param corpus_paths The corpus file paths handed to `filetype.parse`.
    """
    global use_shelve, ngram_counts, selected_candidates
    # Dummy file initialization to avoid warnings in PyCharm
    ngram_counts_tmpfile = selected_candidates_tmpfile = None
    if use_shelve:
        verbose("Making temporary file...")
        (ngram_counts, ngram_counts_tmpfile) = make_shelve()
        (selected_candidates, selected_candidates_tmpfile) = make_shelve()

    try:
        verbose("Counting ngrams...")
        filetype.parse(corpus_paths, NGramCounterHandler(), input_filetype_ext)
    finally:
        # Clean up in a finally clause so that an exception raised while
        # parsing does not leave the temporary shelves behind.
        if use_shelve:
            verbose("Removing temporary files...")
            destroy_shelve(ngram_counts, ngram_counts_tmpfile)
            destroy_shelve(selected_candidates, selected_candidates_tmpfile)
Exemple #6
0
def main(corpus_paths):
    """
        Main function.

        Counts the ngrams of the given corpus files. When `use_shelve` is
        set, the counters are backed by temporary shelves that are removed
        again once counting is over, even if parsing fails.

        @param corpus_paths The corpus file paths handed to `filetype.parse`.
    """
    global use_shelve, ngram_counts, selected_candidates
    # Dummy file initialization to avoid warnings in PyCharm
    ngram_counts_tmpfile = selected_candidates_tmpfile = None
    if use_shelve:
        verbose("Making temporary file...")
        (ngram_counts, ngram_counts_tmpfile) = make_shelve()
        (selected_candidates, selected_candidates_tmpfile) = make_shelve()

    try:
        verbose("Counting ngrams...")
        filetype.parse(corpus_paths, NGramCounterHandler(), input_filetype_ext)
    finally:
        # Runs on success and on failure alike, so the temporary shelves
        # never outlive the call.
        if use_shelve:
            verbose("Removing temporary files...")
            destroy_shelve(ngram_counts, ngram_counts_tmpfile)
            destroy_shelve(selected_candidates, selected_candidates_tmpfile)
Exemple #7
0
            print_cand_freq = True
        elif o in ("-i", "--index") :
            input_filetype_ext = "BinaryIndex"
            warn("Option -i is deprecated; use --from=BinaryIndex")
        elif o == "--id-order":
            id_order = a.split(":")
        elif o == "--from" :
            input_filetype_ext = a
        elif o == "--to" :
            output_filetype_ext = a
        else:
            raise Exception("Bad flag")

    if non_overlapping and match_distance == "All":
        # If we are taking all matches, we need to be able to overlap...
        error("Conflicting options: --match-distance=All and --non-overlapping")

    if len(mode) != 1 :
        error("Exactly one option, -p or -n, must be provided")
    if "patterns" in mode:
        global patterns
        patterns = filetype.parse_entities([patterns_file])

################################################################################  
# MAIN SCRIPT

# Long option names passed to read_options (a trailing "=" presumably marks
# an option that takes a value, getopt-style -- confirm in read_options).
longopts = [ "from=", "to=", "patterns=", "ngram=", "index", "match-distance=",
        "non-overlapping", "freq", "ignore-pos", "surface", "source", "id-order=" ]
# Parse the command line; treat_options is handed over as the option callback.
arg = read_options( "p:n:id:NfgsS", longopts, treat_options, -1, usage_string )
# Process the value returned by read_options with a CandidatesGeneratorHandler.
filetype.parse(arg, CandidatesGeneratorHandler(), input_filetype_ext)
Exemple #8
0
    """
    global combination
    global supported_combination
    global main_freq

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-c", "--combination"):
            try:
                combination = []
                combination = interpret_combinations(a)
            except ValueError as message:
                print >> sys.stderr, message
                print >> sys.stderr, "ERROR: argument must be list separated"+ \
                                     "by \":\" and containing the names: "+\
                                     str( supported_combination )
                usage(usage_string)
                sys.exit(2)
        elif o in ("-o", "--original"):
            main_freq = a


################################################################################
# MAIN SCRIPT

# Long option names passed to read_options alongside the short options "c:o:".
longopts = ["combination=", "original="]
# Parse the command line; treat_options is handed over as the option callback.
args = read_options("c:o:", longopts, treat_options, -1, usage_string)

# Process the value returned by read_options with a FreqCombinerHandler.
filetype.parse(args, FreqCombinerHandler())
Exemple #9
0
    for (o, a) in opts:
        if o in ("-f", "--feat"):
            feat_list = treat_feat_list(a)
        elif o in ("-a", "--asc"):
            ascending = True
            a_or_d.append("a")
        elif o in ("-d", "--desc"):
            ascending = False
            a_or_d.append("d")
        elif o in ("-p", "--precs"):
            print_precs = True
        elif o == "--from":
            input_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if len(a_or_d) > 1:
        warn("you should provide only one option, -a OR -d. Only the last one"+\
             " will be considered.")
    if not feat_list:
        error("You MUST provide at least one feature with -f")


################################################################################
# MAIN SCRIPT

# Long option names passed to read_options alongside the short options "f:adp".
longopts = ["from=", "feat=", "asc", "desc", "precs"]
# Parse the command line; treat_options is handed over as the option callback.
# NOTE(review): 1 is presumably the expected number of arguments -- confirm.
args = read_options("f:adp", longopts, treat_options, 1, usage_string)
# Process the input with a StatsCollectorHandler, then print the statistics.
filetype.parse(args, StatsCollectorHandler(), input_filetype_ext)
print_stats()
Exemple #10
0
        @param opts The options parsed by getopts. Ignored.
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.    
    """
    global surface_instead_lemmas
    global lemmapos
    global input_filetype_ext
    
    treat_options_simplest( opts, arg, n_arg, usage_string )
        
    for ( o, a ) in opts:        
        if o in ("-s", "--surface") : 
            surface_instead_lemmas = True     
        elif o in ("-p", "--lemmapos") : 
            lemmapos = True   
        elif o == "--from":
            input_filetype_ext = a                          
        else:
            raise Exception("Bad arg: " + o)

################################################################################     
# MAIN SCRIPT

# Long option names passed to read_options alongside the short options "sp".
longopts = [ "surface", "lemmapos", "from=" ]
# Parse the command line; treat_options is handed over as the option callback.
args = read_options( "sp", longopts, treat_options, -1, usage_string )
# Build the CSV printer from the module-level lemmapos and
# surface_instead_lemmas flags, which must be final by the time this line runs.
handler = ft_csv.CSVPrinter("candidates", lemmapos=lemmapos,
        surfaces=surface_instead_lemmas)
filetype.parse(args, handler, input_filetype_ext)
Exemple #11
0
    global limit
    global entity_buffer
    global input_filetype_ext
    global output_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for ( o, a ) in opts:
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o in ("-n", "--number"):
            try:
                limit = int(a)
                entity_buffer = [None] * limit
                if limit < 0:
                    raise ValueError
            except ValueError:
                error("You must provide a positive " + \
                      "integer value as argument of -n option.")
        else:
            raise Exception("Bad arg: " + o)


################################################################################
# MAIN SCRIPT

# Parse the command line; treat_options is handed over as the option callback.
args = read_options("n:", ["from=", "to=", "number="], treat_options, -1, usage_string)
# The handler is built from the module-level `limit` as it stands after
# option parsing.
filetype.parse(args, TailPrinterHandler(limit), input_filetype_ext)
Exemple #12
0
    """Callback function that handles the command line options of this script.
    @param opts The options parsed by getopts. Ignored.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.
    @param usage_string Instructions that appear if you run the program with
    the wrong parameters or options.
    """
    global sent_split
    global output_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for ( o, a ) in opts:
        if o in ("-s", "--sentence"):
            sent_split = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)



################################################################################     
# MAIN SCRIPT

# Long option names passed to read_options alongside the short option "s:".
longopts = ["sentence=", "to="]
# Parse the command line; treat_options is handed over as the option callback.
args = read_options("s:", longopts, treat_options, -1, usage_string)
# Output handler chosen from output_filetype_ext; the input is read through an
# explicit TreeTaggerParser built with "utf-8" and the sent_split setting.
handler = filetype.AutomaticPrinterHandler(output_filetype_ext)
parser = ft_treetagger.TreeTaggerParser("utf-8", sent_split)
filetype.parse(args, handler, parser=parser)
Exemple #13
0
            ignore_pos = True
        elif o in ("-c", "--case"):
            ignore_case = False
        elif o in ("-L", "--lemma-or-surface"):
            lemma_or_surface = True
        elif o == "--input-from":
            input_filetype_ext = a
        elif o == "--reference-from":
            reference_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)
            
    # The reference list needs to be opened after all the options are read,
    # since options such as -g and -c modify the way the list is represented
    if ref_name :
        filetype.parse([ref_name], ReferenceReaderHandler(), reference_filetype_ext)
        gs_name = re.sub( ".*/", "", re.sub( "\.xml", "", ref_name ) )
    # There's no reference list... Oh oh cannot evaluate :-(
    if not pre_gs :
        error("You MUST provide a non-empty reference list!")


################################################################################
# MAIN SCRIPT

# Long option names passed to read_options alongside the short options "r:gcL".
longopts = ["input-from=", "reference-from=",
        "reference=", "ignore-pos", "case", "lemma-or-surface"]
# Parse the command line; treat_options is handed over as the option callback.
args = read_options( "r:gcL", longopts, treat_options, -1, usage_string )

# Process the value returned by read_options with an EvaluatorHandler.
filetype.parse(args, EvaluatorHandler(), input_filetype_ext)
Exemple #14
0
        @param n_arg The number of arguments expected for this script.
    """
    global web_freq

    treat_options_simplest(opts, arg, n_arg, usage_string)

    mode = []
    for ( o, a ) in opts:
        if o in ( "-y", "--yahoo" ):
            web_freq = YahooFreq()
            mode.append("yahoo")
        elif o in ( "-w", "--google" ):
            web_freq = GoogleFreq()
            mode.append("google")

    if len(mode) > 1:
        error("At most one option -y or -w, should be provided")


################################################################################
# MAIN SCRIPT

# Long option names passed to read_options alongside the short options "wy".
longopts = ["google", "yahoo"]
# Parse the command line; treat_options is handed over as the option callback.
args = read_options("wy", longopts, treat_options, -1, usage_string)

try:
    filetype.parse(args, LemmatiserHandler())
finally:
    # Flush the cache whether or not parsing succeeded, but only when a
    # web_freq object is set.
    if web_freq:
        web_freq.flush_cache()
Exemple #15
0
            self.chain = self.make_printer(info, output_filetype_ext)
        self.chain.before_file(fileobj, info)


def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    Stores the values of --from and --to in the module-level
    `input_filetype_ext` and `output_filetype_ext`.

    @param opts The (option, value) pairs parsed by getopts.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.
    @param usage_string The usage message, passed on to
    `treat_options_simplest`.
    """
    global input_filetype_ext
    global output_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        # Compare with ==: `o in ("--from")` is a substring test against the
        # string "--from" (the parentheses do not make a tuple), so e.g. "-f"
        # or "--" would have matched too.
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)


################################################################################
# MAIN SCRIPT

# Long option names passed to read_options; no short options are declared.
longopts = ["from=", "to="]
# Parse the command line; treat_options is handed over as the option callback.
args = read_options("", longopts, treat_options, -1, usage_string)
# Process the value returned by read_options with a ConverterHandler.
filetype.parse(args, ConverterHandler(), input_filetype_ext)
Exemple #16
0
        @param n_arg The number of arguments expected for this script.
    """
    global web_freq

    treat_options_simplest(opts, arg, n_arg, usage_string)

    mode = []
    for (o, a) in opts:
        if o in ("-y", "--yahoo"):
            web_freq = YahooFreq()
            mode.append("yahoo")
        elif o in ("-w", "--google"):
            web_freq = GoogleFreq()
            mode.append("google")

    if len(mode) > 1:
        error("At most one option -y or -w, should be provided")


################################################################################
# MAIN SCRIPT

# Long option names passed to read_options alongside the short options "wy".
longopts = ["google", "yahoo"]
# Parse the command line; treat_options is handed over as the option callback.
args = read_options("wy", longopts, treat_options, -1, usage_string)

try:
    filetype.parse(args, LemmatiserHandler())
finally:
    # Runs on success and on failure; flushes the cache only when a web_freq
    # object is set.
    if web_freq:
        web_freq.flush_cache()
Exemple #17
0
    else:  # Web search, entries are single surface or lemma forms
        if surface_flag:
            build_entry = lambda surface, lemma, pos: surface
        else:
            build_entry = lambda surface, lemma, pos: lemma

    if len(mode) != 1:
        error("Exactly one option -u, -w or -i, must be provided")
    #elif text_input and web_freq is None:
    #    warn("-x option is recommended for web queries, not textual indices")


################################################################################
# MAIN SCRIPT

# Long option names passed to read_options together with the short options
# "ywi:gsoal:Jbu:T:".
longopts = [
    "candidates-from=", "corpus-from=", "to=", "yahoo", "google", "index=",
    "ignore-pos", "surface", "old", "lower=", "upper=", "vars", "lang=",
    "no-joint", "bigrams", "univ=", "web1t="
]
# Parse the command line; treat_options is handed over as the option callback.
args = read_options("ywi:gsoal:Jbu:T:", longopts, treat_options, -1,
                    usage_string)

try:
    verbose("Counting ngrams in candidates file")
    filetype.parse(args, CounterPrinter(), filetype_candidates_ext)
finally:
    # The flush sits in `finally` so it also happens when parsing raises.
    if web_freq:
        web_freq.flush_cache()  # VERY IMPORTANT!
Exemple #18
0

################################################################################

def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    Stores the value of --from in the module-level `input_filetype_ext`.

    @param opts The (option, value) pairs parsed by getopts.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.
    @param usage_string The usage message, passed on to
    `treat_options_simplest`.
    """
    global input_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        # Compare with ==: `o in ("--from")` is a substring test against the
        # string "--from" (no tuple is created), so "-f" or "--" would also
        # have been accepted.
        if o == "--from":
            input_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)


################################################################################     
# MAIN SCRIPT

# Long option names passed to read_options; no short options are declared.
longopts = ["from="]
# Parse the command line; treat_options is handed over as the option callback.
args = read_options("", longopts, treat_options, -1, usage_string)
# Relation name: "stdin" when read_options returned nothing, otherwise the
# first returned item with every ".xml" occurrence removed.
relation_name = "stdin" if len(args) == 0 else args[0].replace(".xml", "")
# Print through the "ARFF" printer class, instantiated for "corpus".
filetype.parse(args, filetype.printer_class("ARFF")("corpus",
        relation_name=relation_name), input_filetype_ext)
Exemple #19
0
        elif o in ("-p", "--patterns"):
            input_patterns = filetype.parse_entities([a])
        elif o in ("-d", "--match-distance") : 
            match_distance = a
        elif o in ("-N", "--non-overlapping") : 
            non_overlapping = True
        elif o == "--id-order":
            id_order = a.split(":")
        elif o == "--annotate":
            annotate = True
        elif o == "--only-matching":
            only_the_matching_subpart = True
        else:
            raise Exception("Bad arg " + o)

    if input_patterns is None:
        util.error("No patterns provided. Option --patterns is mandatory!")

    if only_the_matching_subpart and annotate:
        util.warn("Switch --only-matching disables --annotate")


################################################################################
# MAIN SCRIPT

# Long option names for util.read_options. A trailing "=" makes getopt require
# a value, so "non-overlapping" is declared WITHOUT it: the callback treats
# it as a plain switch (like its short form -N), and with "=" it would
# swallow the following command-line token as its value.
longopts = ["input-from=", "to=", "patterns=",
        "match-distance=", "non-overlapping", "id-order=", "annotate",
        "only-matching"]
# Parse the command line; treat_options is handed over as the option callback.
args = util.read_options("p:d:N", longopts, treat_options, -1, usage_string)
# Process the value returned by read_options with a GrepHandler.
filetype.parse(args, GrepHandler(), input_filetype_ext)
Exemple #20
0
    @param n_arg The number of arguments expected for this script.
    """
    global executable_w
    global executable_beg
    global executable_end
    global input_filetype_ext
    global output_filetype_ext
    
    util.treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o == "--begin":
            executable_beg = compile(a, "<cmdline:--begin>", "exec")
        elif o == "--end":
            executable_end = compile(a, "<cmdline:--end>", "exec")
        elif o in ("-w", "--each-word"):
            executable_w = compile(a, "<cmdline:--each-word>", "exec")
        else:
            raise Exception("Bad arg " + o)

################################################################################
# MAIN SCRIPT

# Long option names passed to util.read_options alongside the short option "w:".
longopts = ["from=", "to=", "begin=", "end=", "each-word="]
# Parse the command line; treat_options is handed over as the option callback.
args = util.read_options("w:", longopts, treat_options, -1, usage_string)
# Process the value returned by read_options with a TransformHandler.
filetype.parse(args, TransformHandler(), input_filetype_ext)
Exemple #21
0
        @param n_arg The number of arguments expected for this script.    
    """
    global ignore_pos
    global surface_instead_lemmas
    global input_filetype_ext
    global output_filetype_ext
    
    treat_options_simplest( opts, arg, n_arg, usage_string )    

    for ( o, a ) in opts:
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o in ("-g", "--ignore-pos") :
            ignore_pos = True
        elif o in ("-s", "--surface") :
            surface_instead_lemmas = True
        else:
            raise Exception("Bad arg: " + o)


                
################################################################################    
# MAIN SCRIPT

# Long option names passed to read_options alongside the short options "gst".
longopts = [ "from=", "to=", "ignore-pos", "surface" ]
# Parse the command line; treat_options is handed over as the option callback.
# NOTE(review): the short option "t" has no branch in the visible part of
# treat_options -- confirm it is handled elsewhere or drop it.
args = read_options( "gst", longopts, treat_options, -1, usage_string )

# Process the value returned by read_options with a UniqerHandler.
filetype.parse(args, UniqerHandler(), input_filetype_ext)
Exemple #22
0
    treat_options_simplest( opts, arg, n_arg, usage_string )        

    for ( o, a ) in opts:
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o in ("-l","--lemmas" ) :
            lower_attr = "lemma"
        elif o in ("-a", "--algorithm"):
            algoname = a.lower()
        elif o in ("-m", "-x"):
        	error( "Deprecated options -x and -m. Run with -h for details" )
        else:
            raise Exception("Bad arg: " + o)

 
################################################################################
# MAIN SCRIPT

# Long option names passed to read_options alongside the short options "a:xml".
longopts = [ "from=", "to=", "algorithm=", "lemmas" ]
# Parse the command line; treat_options is handed over as the option callback.
# NOTE(review): 1 is presumably the expected number of arguments -- confirm.
args = read_options( "a:xml", longopts, treat_options, 1, usage_string )

# Every algorithm except "simple" gets an extra first pass over the input
# with a VocabReaderHandler.
if algoname != "simple" :
    verbose( "Pass 1: Reading vocabulary from file... please wait" )
    filetype.parse(args, VocabReaderHandler(), input_filetype_ext)

# The lowercasing pass always runs.
verbose( "Pass 2: Lowercasing the words in the file" )
filetype.parse(args, LowercaserHandler(), input_filetype_ext)
Exemple #23
0
    @param n_arg The number of arguments expected for this script.
    """
    global limit
    global input_filetype_ext
    global output_filetype_ext
    
    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o in ("-n", "--number"):
            try:
                limit = int( a )
                if limit < 0:
                    raise ValueError
            except ValueError:
                error("You must provide a positive " \
                         "integer value as argument of -n option.")
        else:
            raise Exception("Bad arg")

################################################################################
# MAIN SCRIPT

# Long option names passed to read_options alongside the short option "n:".
longopts = ["from=", "to=", "number="]
# Parse the command line; treat_options is handed over as the option callback.
args = read_options("n:", longopts, treat_options, -1, usage_string)
# The handler is built from the module-level `limit` as it stands after
# option parsing.
filetype.parse(args, HeadPrinterHandler(limit), input_filetype_ext)
            if algoname == "simple" :  # Redundant, kept for clarity
                sent_handler = LowercaserHandler.handle_sentence_simple
            elif algoname == "complex" :
                sent_handler = LowercaserHandler.handle_sentence_complex
            elif algoname == "aggressive" :  # Redundant, kept for clarity
                sent_handler = LowercaserHandler.handle_sentence_aggressive
            else :
                ctxinfo.error("Bad algorithm name `{name}`", name=algoname)

        elif o == "-m":
            ctxinfo.error("Deprecated option. Use --from=Moses instead" )
        elif o == "-x":
            ctxinfo.error("Deprecated option. " \
                    "Use --from=PlainCorpus instead")
        else:
            raise Exception("Bad arg: " + o)

 
################################################################################
# MAIN SCRIPT

# Long option names passed to util.read_options alongside the short options
# "a:xml".
longopts = [ "from=", "to=", "algorithm=", "lemmas" ]
# Parse the command line; treat_options is handed over as the option callback.
# NOTE(review): 1 is presumably the expected number of arguments -- confirm.
args = util.read_options( "a:xml", longopts, treat_options, 1, usage_string )

# Any sentence handler other than the simple one gets an extra first pass
# over the input with a VocabReaderHandler.
if sent_handler != LowercaserHandler.handle_sentence_simple :
    util.verbose( "Pass 1: Reading vocabulary from file... please wait" )
    filetype.parse(args, VocabReaderHandler(), input_filetype_ext)

# The lowercasing pass always runs.
util.verbose( "Pass 2: Lowercasing the words in the file" )
filetype.parse(args, LowercaserHandler(), input_filetype_ext)
Exemple #25
0
    a_or_d = []
    for (o, a) in opts:
        if o in ("-f", "--feat"):
            feat_list = treat_feat_list(a)
        elif o in ("-a", "--asc"):
            ascending = True
            a_or_d.append("a")
        elif o in ("-d", "--desc"):
            ascending = False
            a_or_d.append("d")
        elif o in ("-p", "--precs"):
            print_precs = True
        elif o == "--from":
            input_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if len(a_or_d) > 1:
        warn("you should provide only one option, -a OR -d. Only the last one" + " will be considered.")
    if not feat_list:
        error("You MUST provide at least one feature with -f")


################################################################################
# MAIN SCRIPT

# Long option names passed to read_options alongside the short options "f:adp".
longopts = ["from=", "feat=", "asc", "desc", "precs"]
# Parse the command line; treat_options is handed over as the option callback.
# NOTE(review): 1 is presumably the expected number of arguments -- confirm.
args = read_options("f:adp", longopts, treat_options, 1, usage_string)
# Process the input with a StatsCollectorHandler, then print the statistics.
filetype.parse(args, StatsCollectorHandler(), input_filetype_ext)
print_stats()
Exemple #26
0
                self.add("\n")
            self.add(handled_type, ":\n")
            self.handled_type = handled_type
            self.counter = 0



###########################################################

def treat_options(opts, arg, n_arg, usage_string):
    """Option callback of this script; it handles no option of its own.

    All the work is delegated to `treat_options_simplest`, which receives
    the four parameters unchanged.

    @param opts The options parsed by getopts.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.
    @param usage_string The usage message of this script.
    """
    treat_options_simplest(opts, arg, n_arg, usage_string)


        
################################################################################  
# MAIN SCRIPT


# Only run when executed as a script, not when imported.
if __name__ == "__main__":
    # No short or long options are declared for this script.
    longopts = []
    # Parse the command line; treat_options is handed over as the callback.
    args = read_options("", longopts, treat_options, -1, usage_string)
    # Process the value returned by read_options with a PrettyPrinterHandler.
    parse(args, PrettyPrinterHandler())
Exemple #27
0
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.    
    """
    global surface_instead_lemmas
    global lemmapos
    global input_filetype_ext
    
    treat_options_simplest( opts, arg, n_arg, usage_string )

    for ( o, a ) in opts:        
        if o in ("-s", "--surface") : 
            surface_instead_lemmas = True
        elif o in ("-p", "--lemmapos") : 
            lemmapos = True   
        elif o in ("-f", "--freq-source") : 
            freq_source = a
        elif o == "--from":
            input_filetype_ext = a                          
        else:
            raise Exception("Bad arg: " + o)

################################################################################     
# MAIN SCRIPT

# Long option names for read_options. The comma between "freq-source=" and
# "from=" is essential: adjacent string literals are concatenated, which
# would register a single bogus option "freq-source=from=" and make both
# --freq-source and --from unrecognised.
longopts = [ "surface", "lemmapos", "freq-source=", "from=" ]
# Parse the command line; treat_options is handed over as the option callback.
args = read_options( "spf:", longopts, treat_options, -1, usage_string )
# Build the UCS printer from the module-level settings, which must be final
# by the time this line runs.
handler = ft_ucs.UCSPrinter("candidates", freq_source=freq_source,
        lemmapos=lemmapos, surfaces=surface_instead_lemmas)
filetype.parse(args, handler, input_filetype_ext)
Exemple #28
0
        
        @param n_arg The number of arguments expected for this script.    
    """
    global ignore_pos
    global surface_instead_lemmas
    global input_filetype_ext
    global output_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o in ("-g", "--ignore-pos"):
            ignore_pos = True
        elif o in ("-s", "--surface"):
            surface_instead_lemmas = True
        else:
            raise Exception("Bad arg: " + o)


################################################################################
# MAIN SCRIPT

# Long option names passed to read_options alongside the short options "gst".
longopts = ["from=", "to=", "ignore-pos", "surface"]
# Parse the command line; treat_options is handed over as the option callback.
# NOTE(review): the short option "t" has no branch in the visible part of
# treat_options -- confirm it is handled elsewhere or drop it.
args = read_options("gst", longopts, treat_options, -1, usage_string)

# Process the value returned by read_options with a UniqerHandler.
filetype.parse(args, UniqerHandler(), input_filetype_ext)
Exemple #29
0
            output_filetype_ext = a
        elif o == "--filter":
            action_annotate = False
            action_filter = True
        elif o == "--filter-and-annot":
            action_filter = True
        else:
            raise Exception("Bad arg: " + o)

    if not candidates_fnames:
        error("No candidates file given!")
    if detector_class == SourceDetector and n_gaps is not None:
        error('Bad arguments: method "Source" with "--gaps"')
    c = CandidatesHandler()
    verbose("Reading MWE list from candidates file")
    filetype.parse(candidates_fnames, c, filetype_candidates_ext)
    verbose("MWE list loaded in memory successfully")
    global detector
    detector = detector_class(c.info, n_gaps)


################################################################################
# MAIN SCRIPT

# Long option names passed to read_options together with the short options
# "c:d:g:So:".
longopts = [
    "corpus-from=", "candidates-from=", "to=", "candidates=", "detector=",
    "gaps=", "source", "filter", "filter-and-annot"
]
# Parse the command line; treat_options is handed over as the option callback.
# NOTE(review): the short option "o:" has no branch in the visible part of
# treat_options -- confirm it is handled elsewhere or drop it.
arg = read_options("c:d:g:So:", longopts, treat_options, -1, usage_string)
# Process the value returned by read_options with an AnnotatorHandler.
filetype.parse(arg, AnnotatorHandler(), filetype_corpus_ext)
Exemple #30
0
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.
        
        @param usage_string The usage string for the current script.    
    """
    global attributes

    treat_options_simplest( opts, arg, n_arg, usage_string )
    
    for (o, a) in opts:
        if o in ("-a", "--attributes"):
            attributes = a.split(":")
            for attr in attributes:
                if attr not in WORD_ATTRIBUTES:
                    error("Unknown attribute '%s'!" % attr)

    if attributes is None:
        print >>sys.stderr, "The option -a <attributes> is mandatory."
        usage(usage_string)
        sys.exit(2)


################################################################################
# MAIN SCRIPT    

# Long option name for read_options. It must be spelled "attributes=" to match
# the "--attributes" test in treat_options; the previous spelling
# "atttibutes=" made the long form of the mandatory -a option unusable.
longopts = ["attributes="]
# Parse the command line; treat_options is handed over as the option callback.
arg = read_options("a:", longopts, treat_options, -1, usage_string)
# Process the value returned by read_options with a TxtGeneratorHandler.
filetype.parse(arg, TxtGeneratorHandler())
Exemple #31
0

################################################################################           
  
def treat_options( opts, arg, n_arg, usage_string ) :
    """Callback function that handles the command line options of this script.

    The generic processing is delegated to `treat_options_simplest`; this
    function then looks for -n/--number and stores its value in the global
    `limit`.

    @param opts The (option, value) pairs parsed by getopts.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.
    @param usage_string The usage string, forwarded to
    `treat_options_simplest`.
    """
    global limit

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for flag, value in opts:
        if flag not in ("-n", "--number"):
            continue
        try:
            # `limit` is assigned before the sign check, so a negative number
            # is stored even though it is then reported as an error.
            limit = int(value)
            accepted = limit >= 0
        except ValueError:
            accepted = False
        if not accepted:
            error("You must provide a positive integer value as argument "
                  "of -n option.")

################################################################################         
# MAIN SCRIPT

# Single script-specific option: -n / --number, which takes a value.
longopts = ["number="]
args = read_options(
    "n:", longopts, treat_options, -1, usage_string
)
# Process the arguments returned by read_options with the histogram handler.
filetype.parse(args, HistogramGeneratorHandler())
Exemple #32
0
        verb_table[ "google" ].sort( key=operator.itemgetter(3), reverse=True )
        verb_table["google"] = verb_table["google"][ 0:5 ]
        compl_table[ "google" ].sort( key=operator.itemgetter(1), reverse=True )
        compl_table["google"] = compl_table["google"][ 0:5 ]
        ent = entropy( probs_from_varfreqs( map( operator.itemgetter(0),
                       freq_table["google"] ) ) )
        ent_w = entropy( probs_weighted( map( operator.itemgetter(0),
                         freq_table["google"] ), map( operator.itemgetter(1,2,3),
                         freq_table["google"] ) ) )
        ent_w_verb = entropy( probs_weighted( map( operator.itemgetter(0),
                              compl_table["google"] ),
                              map( operator.itemgetter(1,2,3),
                              compl_table["google"] ) ) )
        ent_w_compl = entropy( probs_weighted( map( operator.itemgetter(0),
                               verb_table["google"] ),
                               map( operator.itemgetter(1,2,3),
                               verb_table["google"] ) ) )    
        candidate.add_feat( Feature( "entropy_google", str( ent ) ) )
        candidate.add_feat( Feature( "entropy_w_google", str( ent_w ) ) )
        candidate.add_feat( Feature( "entropy_w_verb_google", str( ent_w_verb ) ) )    
        candidate.add_feat( Feature( "entropy_w__compl_google", str( ent_w_compl )))    
        self.chain.handle_candidate(candidate, info)


################################################################################
# MAIN SCRIPT
# This script declares no options of its own, so both the short and the long
# option lists are empty and the generic callback is used directly.
longopts = []
args = read_options(
    "", longopts, treat_options_simplest, -1, usage_string
)
filetype.parse(args, FeatGeneratorHandler())
Exemple #33
0
        if o in ("--from"):
            input_filetype_ext = a
        elif o in ("--to"):
            output_filetype_ext = a
        elif o == "--keep-empty-words":
            keep_empty_words = True
        elif o == "--word-lemmas":
            take_lemma = True
        elif o == "--word-lemmas-matching":
            regex_word_lemma = a
        elif o == "--word-surfaces-matching":
            regex_word_surface = a
        elif o == "--word-pos-matching":
            regex_word_pos = a
        elif o == "--word-syn-matching":
            regex_word_syn = a
        else:
            raise Exception("Bad arg")



################################################################################
# MAIN SCRIPT

# Only long options are declared for this script (getopt-style: a trailing
# "=" marks an option that takes a value).
longopts = [
    "from=",
    "to=",
    "keep-empty-words",
    "word-lemmas",
    "word-lemmas-matching=",
    "word-surfaces-matching=",
    "word-pos-matching=",
    "word-syn-matching=",
]
args = read_options("", longopts, treat_options, -1, usage_string)
printer = SelectorPrinterHandler()
# The input file type is read after the options were processed, so a value
# set through "--from" is the one passed to the parser.
filetype.parse(args, printer, input_filetype_ext)
Exemple #34
0
def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    After the generic processing done by `treat_options_simplest`, every
    option is dispatched: -s/--sentence sets the global `sent_split`, --to
    sets the global `output_filetype_ext`, and any other option raises.

    @param opts The (option, value) pairs parsed by getopts.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.
    @param usage_string Instructions that appear if you run the program with
    the wrong parameters or options.
    """
    global sent_split, output_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for flag, value in opts:
        if flag == "--to":
            output_filetype_ext = value
            continue
        if flag in ("-s", "--sentence"):
            sent_split = value
            continue
        # Anything still in `opts` that is not handled above is rejected.
        raise Exception("Bad arg: " + flag)


################################################################################
# MAIN SCRIPT

# Options of this script: -s / --sentence and --to, both taking a value.
longopts = [
    "sentence=",
    "to=",
]
args = read_options(
    "s:", longopts, treat_options, -1, usage_string
)
# Both objects are built after option processing so that they pick up the
# values stored in `output_filetype_ext` and `sent_split` by the callback.
handler = filetype.AutomaticPrinterHandler(output_filetype_ext)
parser = ft_treetagger.TreeTaggerParser("utf-8", sent_split)
filetype.parse(args, handler, parser=parser)
Exemple #35
0
################################################################################


def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.

    Delegates the generic processing to `treat_options_simplest`, then reads
    the value of -n/--number into the global `limit` and reports an error
    when that value is not an integer or is negative.

    @param opts The (option, value) pairs parsed by getopts.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.
    @param usage_string The usage string, forwarded to
    `treat_options_simplest`.
    """
    global limit

    treat_options_simplest(opts, arg, n_arg, usage_string)

    number_flags = ("-n", "--number")
    for flag, value in opts:
        if flag in number_flags:
            try:
                limit = int(value)
            except ValueError:
                valid = False
            else:
                # The value is already stored in `limit` at this point, even
                # when it turns out to be negative.
                valid = limit >= 0
            if not valid:
                error("You must provide a positive integer value as argument "
                      "of -n option.")


################################################################################
# MAIN SCRIPT

# Single script-specific option: -n / --number, which takes a value.
longopts = [
    "number=",
]
args = read_options(
    "n:", longopts, treat_options, -1, usage_string
)
filetype.parse(args, HistogramGeneratorHandler())
Exemple #36
0
        @param n_arg The number of arguments expected for this script.
    """
    global combination
    global supported_combination
    global main_freq
    
    treat_options_simplest( opts, arg, n_arg, usage_string )
        
    for ( o, a ) in opts:
        if o in ( "-c", "--combination" ) :
            try :
                combination = []
                combination = interpret_combinations( a )
            except ValueError as message :
                print >> sys.stderr, message
                print >> sys.stderr, "ERROR: argument must be list separated"+ \
                                     "by \":\" and containing the names: "+\
                                     str( supported_combination )
                usage( usage_string )
                sys.exit( 2 )
        elif o in ( "-o", "--original" ) :
            main_freq = a
    
################################################################################
# MAIN SCRIPT

# Options of this script: -c / --combination and -o / --original, both
# taking a value.
longopts = [
    "combination=",
    "original=",
]
args = read_options(
    "c:o:", longopts, treat_options, -1, usage_string
)

filetype.parse(args, FreqCombinerHandler())
Exemple #37
0
            owl_cand.append(form)
        owl_cand = "_".join(owl_cand) + "\"/>\n"
        self.add_string(owl_cand)


def treat_options( opts, arg, n_arg, usage_string ) :
    """Callback function that handles the command line options of this script.

    After the generic processing done by `treat_options_simplest`, the only
    option accepted is -s/--surface, which sets the global
    `surface_instead_lemmas` to True. Any other option raises.

    @param opts The (option, value) pairs parsed by getopts.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.
    @param usage_string The usage string, forwarded to
    `treat_options_simplest`.
    """
    global surface_instead_lemmas

    treat_options_simplest( opts, arg, n_arg, usage_string )

    for ( o, a ) in opts:
        if o in ("-s", "--surface") :
            surface_instead_lemmas = True
        else:
            raise Exception("Bad arg: " + o)


################################################################################     
# MAIN SCRIPT

# Single script-specific option: the -s / --surface flag (no value).
longopts = ["surface"]
args = read_options(
    "s", longopts, treat_options, -1, usage_string
)
filetype.parse(args, OwlPrinter("candidates"))
Exemple #38
0
    for ( o, a ) in opts:
        if o in ( "-m", "--measures" ) :
            try :
                measures = []
                measures = interpret_measures( a )
            except ValueError as message :
                error( str(message)+"\nargument must be list separated by "
                                    "\":\" and containing the names: "+
                       str( supported_measures ))
        elif o in ( "-o", "--original" ) :
            main_freq_name = a
        elif o in ( "-a", "--all" ) :
            join_all_contrastive = True
    
    if not main_freq_name :
        error( "Option -o is mandatory")


################################################################################
# MAIN SCRIPT

# Options of this script: -m / --measures and -o / --original take a value,
# -a / --all is a plain flag.
longopts = [
    "measures=",
    "original=",
    "all",
]
args = read_options("m:o:a", longopts, treat_options, 1, usage_string)

# Every input is parsed twice, one file at a time.
for a in args :
    # First calculate Nc for each contrastive corpus
    verbose("Pass 1 for " + a)
    filetype.parse([a], TotalCalculatorHandler())
    verbose("Pass 2 for " + a)
    filetype.parse([a], MeasureCalculatorHandler())
Exemple #39
0
            suffix_array = index.load("lemma+pos")

    else:  # Web search, entries are single surface or lemma forms
        if surface_flag:
            build_entry = lambda surface, lemma, pos: surface
        else:
            build_entry = lambda surface, lemma, pos: lemma

    if len(mode) != 1:
        error("Exactly one option -u, -w or -i, must be provided")
    #elif text_input and web_freq is None:
    #    warn("-x option is recommended for web queries, not textual indices")


################################################################################
# MAIN SCRIPT

# Long option names handed to read_options (getopt-style: a trailing "="
# marks an option that takes a value).
longopts = [
    "candidates-from=", "corpus-from=", "to=",
    "yahoo", "google", "index=", "ignore-pos", "surface", "old",
    "lower=", "upper=", "vars", "lang=", "no-joint", "bigrams",
    "univ=", "web1t=",
]
args = read_options(
    "ywi:gsoal:Jbu:T:", longopts, treat_options, -1, usage_string
)

try:
    verbose("Counting ngrams in candidates file")
    filetype.parse(args, CounterPrinter(), filetype_candidates_ext)
finally:
    # VERY IMPORTANT: the cache is flushed even when parsing fails, as long
    # as a web frequency object was set up.
    if web_freq:
        web_freq.flush_cache()
Exemple #40
0
    """
    global executable_w
    global executable_beg
    global executable_end
    global input_filetype_ext
    global output_filetype_ext

    util.treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o == "--begin":
            executable_beg = compile(a, "<cmdline:--begin>", "exec")
        elif o == "--end":
            executable_end = compile(a, "<cmdline:--end>", "exec")
        elif o in ("-w", "--each-word"):
            executable_w = compile(a, "<cmdline:--each-word>", "exec")
        else:
            raise Exception("Bad arg " + o)


################################################################################
# MAIN SCRIPT

# Every option of this script takes a value; only --each-word has a short
# form (-w).
longopts = [
    "from=",
    "to=",
    "begin=",
    "end=",
    "each-word=",
]
args = util.read_options(
    "w:", longopts, treat_options, -1, usage_string
)
# The input file type is read after the options were processed, so a value
# set through "--from" is the one passed to the parser.
filetype.parse(args, TransformHandler(), input_filetype_ext)
Exemple #41
0
        elif o == "--id-order":
            id_order = a.split(":")
        elif o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad flag")

    if non_overlapping and match_distance == "All":
        # If we are taking all matches, we need to be able to overlap...
        error(
            "Conflicting options: --match-distance=All and --non-overlapping")

    if len(mode) != 1:
        error("Exactly one option, -p or -n, must be provided")
    if "patterns" in mode:
        global patterns
        patterns = filetype.parse_entities([patterns_file])


################################################################################
# MAIN SCRIPT

# Long option names handed to read_options (getopt-style: a trailing "="
# marks an option that takes a value).
longopts = [
    "from=",
    "to=",
    "patterns=",
    "ngram=",
    "index",
    "match-distance=",
    "non-overlapping",
    "freq",
    "ignore-pos",
    "surface",
    "source",
    "id-order=",
]
arg = read_options(
    "p:n:id:NfgsS", longopts, treat_options, -1, usage_string
)
# The input file type is read after the options were processed, so a value
# set through "--from" is the one passed to the parser.
filetype.parse(arg, CandidatesGeneratorHandler(), input_filetype_ext)
Exemple #42
0
INFO = EvitaInfo()


class EvitaPrinter(filetype.common.AbstractPrinter):
    # Printer restricted to the "candidates" category.
    filetype_info = INFO
    valid_categories = ["candidates"]

    def handle_candidate(self, candidate, info={}):
        """Emit one candidate: a header line with its ID and POS pattern,
        followed by one quoted occurrence per line and a final blank line.

        @param candidate The `Candidate` to be printed. Note that the lemma
        and POS of each of its occurrences are blanked in place.
        @param info Unused by this method.
        """
        pos_pattern = candidate.get_pos_pattern().replace(SEPARATOR, " ")
        header = "candid=%(id)s pos=\"%(pos)s\"\n" % {
            "id": candidate.id_number,
            "pos": pos_pattern,
        }
        self.add_string(header)

        for occurrence in candidate.occurs:
            # Clear lemma and POS before the occurrence is serialised.
            occurrence.set_all(lemma="", pos="")
            text = occurrence.to_string()
            text = text.replace(SEPARATOR, "").replace(WORD_SEPARATOR, " ")
            quoted = "\"%(occur)s\"\n" % {"occur": text}
            self.add_string(quoted.encode('utf-8'))

        # Blank line closing this candidate's record.
        self.add_string("\n")

################################################################################     
# MAIN SCRIPT

# No script-specific options: empty short and long option lists, with the
# generic callback handling whatever read_options parses.
args = read_options(
    "", [], treat_options_simplest, -1, usage_string
)
filetype.parse(args, EvitaPrinter("candidates"))
Exemple #43
0
        if o in ("-m", "--measures"):
            try:
                measures = []
                measures = interpret_measures(a)
            except ValueError as message:
                error(
                    str(message) + "\nargument must be list separated by "
                    "\":\" and containing the names: " +
                    str(supported_measures))
        elif o in ("-o", "--original"):
            main_freq_name = a
        elif o in ("-a", "--all"):
            join_all_contrastive = True

    if not main_freq_name:
        error("Option -o is mandatory")


################################################################################
# MAIN SCRIPT

# Options of this script: -m / --measures and -o / --original take a value,
# -a / --all is a plain flag.
longopts = [
    "measures=",
    "original=",
    "all",
]
args = read_options(
    "m:o:a", longopts, treat_options, 1, usage_string
)

# Every input is parsed twice, one file at a time.
for a in args:
    # First calculate Nc for each contrastive corpus
    verbose("Pass 1 for " + a)
    filetype.parse([a], TotalCalculatorHandler())
    verbose("Pass 2 for " + a)
    filetype.parse([a], MeasureCalculatorHandler())
Exemple #44
0
class EvitaPrinter(filetype.common.AbstractPrinter):
    # Printer restricted to the "candidates" category.
    filetype_info = INFO
    valid_categories = ["candidates"]

    def handle_candidate(self, candidate, info={}):
        """Print a candidate as a header line (ID and POS pattern) followed
        by its occurrences, one quoted surface string per line, and an empty
        line at the end.

        @param candidate The `Candidate` to be printed. The lemma and POS of
        each of its occurrences are blanked in place.
        @param info Unused by this method.
        """
        pattern = candidate.get_pos_pattern()
        pattern = pattern.replace(SEPARATOR, " ")
        header_fields = {"id": candidate.id_number, "pos": pattern}
        self.add_string("candid=%(id)s pos=\"%(pos)s\"\n" % header_fields)

        for occ_form in candidate.occurs:
            # Clear lemma and POS before the occurrence is serialised.
            occ_form.set_all(lemma="", pos="")
            surface = occ_form.to_string().replace(SEPARATOR, "")
            surface = surface.replace(WORD_SEPARATOR, " ")
            line = "\"%(occur)s\"\n" % {"occur": surface}
            self.add_string(line.encode('utf-8'))

        # Blank line closing this candidate's record.
        self.add_string("\n")


################################################################################
# MAIN SCRIPT

# This script has no options of its own, hence the empty short/long option
# lists and the generic option callback.
args = read_options(
    "", [], treat_options_simplest, -1, usage_string
)
filetype.parse(args, EvitaPrinter("candidates"))
Exemple #45
0
        
        @param n_arg The number of arguments expected for this script.    
    """
    global simplify
    global input_filetype_ext
    global output_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    simplify = simplify_ptb

    for (o, a) in opts:
        if o in ("-p", "--palavras"):
            simplify = simplify_palavras
        elif o in ("-G", "--genia"):
            simplify = simplify_genia
        elif o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)


################################################################################
# MAIN SCRIPT

longopts = ["from=", "to=", "palavras", "genia"]
# The option callback of this script tests for "-G" (upper case) as the short
# form of --genia, so the short-option string must declare "G"; a lower-case
# "g" would be accepted by the parser and then rejected as "Bad arg".
# NOTE(review): "x" and "F:" are kept as they were; no branch for them is
# visible in the callback -- confirm whether they are still needed.
args = read_options("xF:pG", longopts, treat_options, -1, usage_string)
# The input file type is read after the options were processed, so a value
# set through "--from" is the one passed to the parser.
filetype.parse(args, FilterHandler(), input_filetype_ext)
Exemple #46
0
        elif o == "--filter":
            action_annotate = False
            action_filter = True
        elif o == "--filter-and-annot":            
            action_filter = True            
        else:
            raise Exception("Bad arg: " + o)

    if not candidates_fnames:
        error("No candidates file given!")
    if detector_class == SourceDetector and n_gaps is not None:
        error('Bad arguments: method "Source" with "--gaps"')
    c = CandidatesHandler()
    verbose("Reading MWE list from candidates file")
    filetype.parse(candidates_fnames,
            c, filetype_candidates_ext)
    verbose("MWE list loaded in memory successfully")
    global detector
    detector = detector_class(c.info, n_gaps)

        
################################################################################  
# MAIN SCRIPT


# Long option names handed to read_options (getopt-style: a trailing "="
# marks an option that takes a value).
longopts = [
    "corpus-from=",
    "candidates-from=",
    "to=",
    "candidates=",
    "detector=",
    "gaps=",
    "source",
    "filter",
    "filter-and-annot",
]
# NOTE(review): the short-option string accepts "-o <value>", but the
# treat_options callback of this script has no visible branch for "-o" --
# confirm whether it is consumed elsewhere or is a leftover.
arg = read_options(
    "c:d:g:So:", longopts, treat_options, -1, usage_string
)
# The corpus file type is read only after the options were processed, so a
# "--corpus-from" value set by the callback is taken into account here.
filetype.parse(arg, AnnotatorHandler(), filetype_corpus_ext)
Exemple #47
0
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.
        
        @param usage_string The usage string for the current script.    
    """
    global attributes

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-a", "--attributes"):
            attributes = a.split(":")
            for attr in attributes:
                if attr not in WORD_ATTRIBUTES:
                    error("Unknown attribute '%s'!" % attr)

    if attributes is None:
        print >> sys.stderr, "The option -a <attributes> is mandatory."
        usage(usage_string)
        sys.exit(2)


################################################################################
# MAIN SCRIPT

# The long option name must match the spelling the option callback compares
# against ("--attributes"); a misspelled name here makes the long form of the
# mandatory -a option impossible to use.
longopts = ["attributes="]
arg = read_options("a:", longopts, treat_options, -1, usage_string)
# Run the arguments returned by read_options through the text generator.
filetype.parse(arg, TxtGeneratorHandler())
Exemple #48
0
        elif o in ("-L", "--lemma-or-surface"):
            lemma_or_surface = True
        elif o == "--input-from":
            input_filetype_ext = a
        elif o == "--reference-from":
            reference_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    # The reference list needs to be opened after all the options are read,
    # since options such as -g and -c modify the way the list is represented
    if ref_name:
        filetype.parse([ref_name], ReferenceReaderHandler(),
                       reference_filetype_ext)
        gs_name = re.sub(".*/", "", re.sub("\.xml", "", ref_name))
    # There's no reference list... Oh oh cannot evaluate :-(
    if not pre_gs:
        error("You MUST provide a non-empty reference list!")


################################################################################
# MAIN SCRIPT

# Long option names handed to read_options (getopt-style: a trailing "="
# marks an option that takes a value).
longopts = [
    "input-from=",
    "reference-from=",
    "reference=",
    "ignore-pos",
    "case",
    "lemma-or-surface",
]
args = read_options(
    "r:gcL", longopts, treat_options, -1, usage_string
)

# The input file type is read after the options were processed, so a value
# set through "--input-from" is the one passed to the parser.
filetype.parse(args, EvaluatorHandler(), input_filetype_ext)