Example #1
0
def open_index(prefix):
    """
    Open the index files (valid index created by the `index.py` script).

    On success this sets the module-level `index`, `freq_name` (the index
    prefix with any leading directory part removed) and `the_corpus_size`
    (read from the "corpus_size" entry of the index metadata). An IOError
    or KeyError raised while loading is reported through `error()`.

    @param prefix The string name of the index file; must end in ".info".
    """
    global freq_name, the_corpus_size
    global index
    extension = ".info"
    # Explicit check rather than `assert`: assertions are stripped under
    # `python -O`, and the slice below would then silently chop the last
    # five characters off whatever filename was given.
    if not prefix.endswith(extension):
        raise AssertionError(
            "Index filename must end with '.info': " + repr(prefix))
    prefix = prefix[:-len(extension)]
    try:
        verbose("Loading index files... this may take some time.")
        index = Index(prefix)
        index.load_metadata()
        # Keep only what follows the last "/" as the frequency source name.
        freq_name = re.sub(r".*/", "", prefix)
        the_corpus_size = index.metadata["corpus_size"]
    except (IOError, KeyError):
        # Both failures get the same user-facing message.
        error("Error opening the index.\nTry again with another index filename")
Example #2
0
def open_index(prefix):
    """
    Open the index files (valid index created by the `index.py` script).

    On success this sets the module-level `index`, `freq_name` (the index
    prefix with any leading directory part removed) and `the_corpus_size`
    (read from the "corpus_size" entry of the index metadata). An IOError
    or KeyError raised while loading is reported through `error()`.

    @param prefix The string name of the index file; must end in ".info".
    """
    global freq_name, the_corpus_size
    global index
    extension = ".info"
    # Explicit check rather than `assert`: assertions are stripped under
    # `python -O`, and the slice below would then silently chop the last
    # five characters off whatever filename was given.
    if not prefix.endswith(extension):
        raise AssertionError(
            "Index filename must end with '.info': " + repr(prefix))
    prefix = prefix[:-len(extension)]
    try:
        verbose("Loading index files... this may take some time.")
        index = Index(prefix)
        index.load_metadata()
        # Keep only what follows the last "/" as the frequency source name.
        freq_name = re.sub(r".*/", "", prefix)
        the_corpus_size = index.metadata["corpus_size"]
    except (IOError, KeyError):
        # Both failures get the same user-facing message.
        error(
            "Error opening the index.\nTry again with another index filename")
Example #3
0
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.

        Each recognised option updates the corresponding module-level
        setting. Exactly one frequency source option (-i, -w, -u or -T) must
        be given; it selects `get_freq_function`, `freq_name` and
        `the_corpus_size`. Afterwards `build_entry` is set to a function
        mapping (surface, lemma, pos) to the string form used for counting.

        @param opts The (option, value) pairs parsed by getopts.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.

        @param usage_string Passed through to `treat_options_simplest`.
    """
    global get_freq_function, build_entry, web_freq
    global the_corpus_size, freq_name
    global low_limit, up_limit
    global count_vars
    global language
    global suffix_array
    global count_joint_frequency
    global count_bigrams
    global web1t_data_path
    global filetype_corpus_ext
    global filetype_candidates_ext
    global output_filetype_ext

    surface_flag = False
    ignorepos_flag = False
    # One entry per frequency-source option seen; must end up with length 1.
    mode = []

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-i", "--index"):
            open_index(a)
            get_freq_function = get_freq_index
            mode.append("index")
        elif o in ("-y", "--yahoo"):
            # Option kept only so that it yields an explicit error message.
            error("THIS OPTION IS DEPRECATED AS YAHOO SHUT DOWN THEIR FREE "
                  "SEARCH API")
        elif o in ("-w", "--google"):
            web_freq = GoogleFreq()
            freq_name = "google"
            ignorepos_flag = True
            the_corpus_size = web_freq.corpus_size()
            get_freq_function = get_freq_web
            mode.append("google")
        elif o in ("-u", "--univ"):
            web_freq = GoogleFreqUniv(a)
            freq_name = "google"
            ignorepos_flag = True
            the_corpus_size = web_freq.corpus_size()
            get_freq_function = get_freq_web
            mode.append("google")
        elif o in ("-T", "--web1t"):
            ignorepos_flag = True
            freq_name = "web1t"
            web1t_data_path = a
            the_corpus_size = int(read_file(web1t_data_path + "/1gms/total"))
            get_freq_function = get_freq_web1t
            mode.append("web1t")
        elif o in ("-s", "--surface"):
            surface_flag = True
        elif o in ("-g", "--ignore-pos"):
            ignorepos_flag = True
        elif o in ("--lower", "--upper"):
            # Parse first, so that only a genuinely non-integer value gets
            # the "must be integer" message.
            try:
                limit = int(a)
            except ValueError as message:
                error(str(message) + "\nArgument of " + o + " must be integer")
                continue
            # A stored limit of -1 is treated as "not set" by the checks
            # below (presumably the module-level default -- TODO confirm).
            if limit < 0:
                error("Argument of " + o + " must be non-negative")
            elif o == "--lower":
                if up_limit == -1 or up_limit >= limit:
                    low_limit = limit
                else:
                    error("Argument of --lower must be <= argument of --upper")
            elif low_limit == -1 or low_limit <= limit:
                up_limit = limit
            else:
                error("Argument of --upper must be >= argument of --lower")
        elif o in ("-a", "--vars"):
            count_vars = True
        elif o in ("-l", "--lang"):
            language = a
        elif o in ("-J", "--no-joint"):
            count_joint_frequency = False
        elif o in ("-B", "--bigrams"):
            count_bigrams = True
        elif o in ("-o", "--old"):
            Index.use_c_indexer(False)
        elif o == "--corpus-from":
            filetype_corpus_ext = a
        elif o == "--candidates-from":
            filetype_candidates_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if mode == ["index"]:
        # Index mode: load the suffix array whose key matches the entry form.
        if surface_flag and ignorepos_flag:
            build_entry = lambda surface, lemma, pos: surface
            suffix_array = index.load("surface")
        elif surface_flag:
            build_entry = lambda surface, lemma, pos: (
                surface + ATTRIBUTE_SEPARATOR + pos)
            suffix_array = index.load("surface+pos")
        elif ignorepos_flag:
            build_entry = lambda surface, lemma, pos: lemma
            suffix_array = index.load("lemma")
        else:
            build_entry = lambda surface, lemma, pos: (
                lemma + ATTRIBUTE_SEPARATOR + pos)
            suffix_array = index.load("lemma+pos")

    else:  # Web search, entries are single surface or lemma forms
        if surface_flag:
            build_entry = lambda surface, lemma, pos: surface
        else:
            build_entry = lambda surface, lemma, pos: lemma

    if len(mode) != 1:
        error("Exactly one option -i, -w, -u or -T must be provided")
Example #4
0
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.

        Each recognised option updates the corresponding module-level
        setting. Exactly one frequency source option (-i, -w, -u or -T) must
        be given; it selects `get_freq_function`, `freq_name` and
        `the_corpus_size`. Afterwards `build_entry` is set to a function
        mapping (surface, lemma, pos) to the string form used for counting.

        @param opts The (option, value) pairs parsed by getopts.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.

        @param usage_string Passed through to `treat_options_simplest`.
    """
    global get_freq_function, build_entry, web_freq
    global the_corpus_size, freq_name
    global low_limit, up_limit
    global count_vars
    global language
    global suffix_array
    global count_joint_frequency
    global count_bigrams
    global web1t_data_path
    global filetype_corpus_ext
    global filetype_candidates_ext
    global output_filetype_ext

    surface_flag = False
    ignorepos_flag = False
    # One entry per frequency-source option seen; must end up with length 1.
    mode = []

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-i", "--index"):
            open_index(a)
            get_freq_function = get_freq_index
            mode.append("index")
        elif o in ("-y", "--yahoo"):
            # Option kept only so that it yields an explicit error message.
            error("THIS OPTION IS DEPRECATED AS YAHOO SHUT DOWN THEIR FREE "
                  "SEARCH API")
        elif o in ("-w", "--google"):
            web_freq = GoogleFreq()
            freq_name = "google"
            ignorepos_flag = True
            the_corpus_size = web_freq.corpus_size()
            get_freq_function = get_freq_web
            mode.append("google")
        elif o in ("-u", "--univ"):
            web_freq = GoogleFreqUniv(a)
            freq_name = "google"
            ignorepos_flag = True
            the_corpus_size = web_freq.corpus_size()
            get_freq_function = get_freq_web
            mode.append("google")
        elif o in ("-T", "--web1t"):
            ignorepos_flag = True
            freq_name = "web1t"
            web1t_data_path = a
            the_corpus_size = int(read_file(web1t_data_path + "/1gms/total"))
            get_freq_function = get_freq_web1t
            mode.append("web1t")
        elif o in ("-s", "--surface"):
            surface_flag = True
        elif o in ("-g", "--ignore-pos"):
            ignorepos_flag = True
        elif o in ("--lower", "--upper"):
            # Parse first, so that only a genuinely non-integer value gets
            # the "must be integer" message.
            try:
                limit = int(a)
            except ValueError as message:
                error(str(message) + "\nArgument of " + o + " must be integer")
                continue
            # A stored limit of -1 is treated as "not set" by the checks
            # below (presumably the module-level default -- TODO confirm).
            if limit < 0:
                error("Argument of " + o + " must be non-negative")
            elif o == "--lower":
                if up_limit == -1 or up_limit >= limit:
                    low_limit = limit
                else:
                    error("Argument of --lower must be <= argument of --upper")
            elif low_limit == -1 or low_limit <= limit:
                up_limit = limit
            else:
                error("Argument of --upper must be >= argument of --lower")
        elif o in ("-a", "--vars"):
            count_vars = True
        elif o in ("-l", "--lang"):
            language = a
        elif o in ("-J", "--no-joint"):
            count_joint_frequency = False
        elif o in ("-B", "--bigrams"):
            count_bigrams = True
        elif o in ("-o", "--old"):
            Index.use_c_indexer(False)
        elif o == "--corpus-from":
            filetype_corpus_ext = a
        elif o == "--candidates-from":
            filetype_candidates_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if mode == ["index"]:
        # Index mode: load the suffix array whose key matches the entry form.
        if surface_flag and ignorepos_flag:
            build_entry = lambda surface, lemma, pos: surface
            suffix_array = index.load("surface")
        elif surface_flag:
            build_entry = lambda surface, lemma, pos: (
                surface + ATTRIBUTE_SEPARATOR + pos)
            suffix_array = index.load("surface+pos")
        elif ignorepos_flag:
            build_entry = lambda surface, lemma, pos: lemma
            suffix_array = index.load("lemma")
        else:
            build_entry = lambda surface, lemma, pos: (
                lemma + ATTRIBUTE_SEPARATOR + pos)
            suffix_array = index.load("lemma+pos")

    else:  # Web search, entries are single surface or lemma forms
        if surface_flag:
            build_entry = lambda surface, lemma, pos: surface
        else:
            build_entry = lambda surface, lemma, pos: lemma

    if len(mode) != 1:
        error("Exactly one option -i, -w, -u or -T must be provided")