Example #1
def treat_options( opts, arg, n_arg, usage_string ) :
    """
        Callback function that handles the command line options of this script.

        @param opts The options parsed by getopts. Ignored.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.
    """
    global measures
    global supported_measures
    global main_freq_name
    global join_all_contrastive
    
    treat_options_simplest( opts, arg, n_arg, usage_string )
    
    for ( o, a ) in opts:
        if o in ( "-m", "--measures" ) :
            try :
                measures = []
                measures = interpret_measures( a )
            except ValueError as message :
                error( str(message)+"\nargument must be a list separated by "
                                    "\":\" and containing the names: "+
                       str( supported_measures ))
        elif o in ( "-o", "--original" ) :
            main_freq_name = a
        elif o in ( "-a", "--all" ) :
            join_all_contrastive = True
    
    if not main_freq_name :
        error( "Option -o is mandatory")
Example #2
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopts. Ignored.
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.
        
        @param usage_string The usage string for the current script.    
    """
    global attributes

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-a", "--attributes"):
            attributes = a.split(":")
            for attr in attributes:
                if attr not in WORD_ATTRIBUTES:
                    error("Unknown attribute '%s'!" % attr)

    if attributes is None:
        print >> sys.stderr, "The option -a <attributes> is mandatory."
        usage(usage_string)
        sys.exit(2)
Example #3
    def handle_candidate(self, candidate, info={}):
        """For each candidate, stores it in a temporary Database (so that it can be
        retrieved later) and also creates a tuple containing the sorting key
        feature values and the candidate ID. All the tuples are stored in a
        global list, that will be sorted once all candidates are read and stored
        into the temporary DB.

        @param candidate The `Candidate` that is being read from the XML file.
        """
        global feat_list, all_feats, feat_list_ok, feat_to_order
        # First, verifies if all the features defined as sorting keys are real
        # features, by matching them against the meta-features of the header. This
        # is only performed once, before the first candidate is processed
        if not feat_list_ok:
            for feat_name in feat_list:
                if feat_name not in all_feats:
                    error(
                        ("%(feat)s is not a valid feature\n"
                         "Please choose features from the list below\n"
                         "%(list)s") % {"feat": feat_name,
                                        "list": "\n".join(map(lambda x: "* " + x, all_feats))}
                    )
            feat_list_ok = True

        for tp_class in candidate.tpclasses:
            for feat_name in feat_list:
                feat_value = candidate.get_feat_value(feat_name)
                tp_value = candidate.get_tpclass_value(tp_class.name)
                if feat_value != UNKNOWN_FEAT_VALUE and tp_value != UNKNOWN_FEAT_VALUE:
                    pair = (float(feat_value), tp_value == "True")
                    feat_to_order[tp_class.name][feat_name].append(pair)
Example #4
    def search_terms(self, in_text, query):
        """            
        """
        if DEFAULT_LANG != "en":
            print("WARNING: Yahoo terms only works for English",
                  file=sys.stderr)
        input_text = in_text.strip()
        if isinstance(input_text, unicode):
            input_text = input_text.encode('utf-8')
        try:
            url = ('http://search.yahooapis.com/ContentAnalysisService/'
                   'V1/termExtraction')
            post_data = urllib.urlencode({
                "context": input_text,
                "appid": YAHOO_APPID,
                "query": query,
                "output": "json"
            })
            request = urllib2.Request(url, post_data)
            response = urllib2.urlopen(request)
            results = simplejson.load(response)
            return results["ResultSet"]["Result"]

        except Exception as err:
            error("Got an error ->" + str(err) +
                  "\nPLEASE VERIFY YOUR INTERNET CONNECTION")
Example #5
    def _fallback_entity(self, entity, info={}):
        """
            For each entity, stores it in a temporary Database (so that it can be
            retrieved later) and also creates a tuple containing the sorting key
            feature values and the entity ID. All the tuples are stored in a
            global list, that will be sorted once all candidates are read and stored
            into the temporary DB.
        """
        global feat_list
        # First, verifies if all the features defined as sorting keys are real
        # features, by matching them against the meta-features of the header. This
        # is only performed once, before the first entity is processed
        if not self.feat_list_ok:
            for feat_name in feat_list:
                if feat_name not in self.all_feats:
                    error("\"{feat}\" is not a valid feature\n" \
                            "Please chose features from the list below:\n" \
                            "{list}".format(feat=feat_name, list="\n".join(
                                    "* " + feat for feat in self.all_feats)))
            self.feat_list_ok = True

        # Store the whole entity in a temporary database
        #info['parser'] = info['fileobj'] = None
        self.all_entities[unicode(entity.id_number)] = (entity, info)

        # Build up a tuple to be added to a list.
        one_tuple = []
        for feat_name in feat_list:
            one_tuple.append(self.feat_value(entity, feat_name))
        # The tuple will contain the sorting key values and the
        # entity ID. The former are used to sort the candidates, the
        # latter is used to retrieve an entity from the temporary DB
        one_tuple.append(entity.id_number)
        self.feat_to_order.append(tuple(one_tuple))
Example #6
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.

        @param opts The options parsed by getopts. Ignored.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.
    """
    global web_freq

    treat_options_simplest(opts, arg, n_arg, usage_string)

    mode = []
    for (o, a) in opts:
        if o in ("-y", "--yahoo"):
            web_freq = YahooFreq()
            mode.append("yahoo")
        elif o in ("-w", "--google"):
            web_freq = GoogleFreq()
            mode.append("google")

    if len(mode) > 1:
        error("At most one option -y or -w, should be provided")
Example #7
def read_distances( d_filename ) :
    """
        Reads the distances between categories from a tab-separated file and 
        generates a list of tuples which will, once the annotation file is 
        read, be converted into a category x category matrix. This needs to be
        done like this because, before reading the annotations file, we do not
        know how many categories Nk will be used. 
        
        @param d_filename The input file name from which the data is read
        @return A dictionary whose keys are the two category names joined by a
        separator and whose values are floats with the distance between them.
    """
    try :
        d_data = open( d_filename )

        distances_map = {} # Use a map to remove duplicates if present
        for line in d_data.readlines() :
            if len(line.strip()) > 0 : # Ignore blank lines
                try :
                    cat1, cat2, distance = line.strip().split("\t")
                    key = "###SEPARATOR###".join( sorted( [ cat1, cat2 ] ) )
                    distances_map[ key ] = float( distance )
                except ValueError :
                    error("ERROR reading distances, expected three values "
                          "separated by TAB, found:\n" + line)
        return distances_map
    except IOError :
        error("\nERROR: Distance file \"%s\" not found" % d_filename)
Example #8
def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.
    @param opts The options parsed by getopts. Ignored.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.    
    """
    global reference_fname
    global mwe_evaluator
    global corpus_filetype_ext
    global reference_filetype_ext

    sentence_aligner_class = NaiveSentenceAligner
    mwe_evaluator_class = ExactMatchMWEEvaluator

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-r", "--reference"):
            reference_fname = a
        elif o in ("--sentence-aligner"):
            sentence_aligner_class = SENTENCE_ALIGNERS[a]
        elif o in ("-e", "--evaluator"):
            mwe_evaluator_class = MWE_EVALUATORS[a]
        elif o == "--corpus-from":
            corpus_filetype_ext = a
        elif o == "--reference-from":
            reference_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if not reference_fname:
        error("No reference file given!")

    sentence_aligner = sentence_aligner_class()
    mwe_evaluator = mwe_evaluator_class(sentence_aligner)
Example #9
def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.
    @param opts The options parsed by getopts. Ignored.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.    
    """
    global input_filetype_ext
    global output_filetype_ext
    global append_pos_tag
    global clean_special

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o == ("--from"):
            input_filetype_ext = a
        elif o == ("--to"):
            output_filetype_ext = a
        elif o == "--append-pos-tag":
        	if a in ("coarse","fine"):
	            append_pos_tag = a
	        else:
	        	error("Expected \"coarse\" or \"fine\", found " + a)
        elif o == "--clean-special":
        	clean_special = True
        else:
            raise Exception("Bad arg: " + o)
Example #10
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopts. Ignored.
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.
        
        @param usage_string The usage string for the current script.    
    """
    global attributes

    treat_options_simplest( opts, arg, n_arg, usage_string )
    
    for (o, a) in opts:
        if o in ("-a", "--attributes"):
            attributes = a.split(":")
            for attr in attributes:
                if attr not in WORD_ATTRIBUTES:
                    error("Unknown attribute '%s'!" % attr)

    if attributes is None:
        print >>sys.stderr, "The option -a <attributes> is mandatory."
        usage(usage_string)
        sys.exit(2)
Example #11
    def _fallback_entity(self, entity, info={}):
        """
            For each entity, stores it in a temporary Database (so that it can be
            retrieved later) and also creates a tuple containing the sorting key
            feature values and the entity ID. All the tuples are stored in a
            global list, that will be sorted once all candidates are read and stored
            into the temporary DB.
        """
        global feat_list
        # First, verifies if all the features defined as sorting keys are real 
        # features, by matching them against the meta-features of the header. This
        # is only performed once, before the first entity is processed
        if not self.feat_list_ok:
            for feat_name in feat_list:
                if feat_name not in self.all_feats:
                    error("\"{feat}\" is not a valid feature\n" \
                            "Please chose features from the list below:\n" \
                            "{list}".format(feat=feat_name, list="\n".join(
                                    "* " + feat for feat in self.all_feats)))
            self.feat_list_ok = True

        # Store the whole entity in a temporary database
        #info['parser'] = info['fileobj'] = None
        self.all_entities[unicode(entity.id_number)] = (entity,info)

        # Build up a tuple to be added to a list.
        one_tuple = []
        for feat_name in feat_list:
            one_tuple.append(self.feat_value(entity, feat_name))
        # The tuple will contain the sorting key values and the
        # entity ID. The former are used to sort the candidates, the 
        # latter is used to retrieve an entity from the temporary DB
        one_tuple.append(entity.id_number)
        self.feat_to_order.append(tuple(one_tuple))
Example #12
def read_distances(d_filename):
    """
        Reads the distances between categories from a tab-separated file and 
        generates a list of tuples which will, once the annotation file is 
        read, be converted into a category x category matrix. This needs to be
        done like this because, before reading the annotations file, we do not
        know how many categories Nk will be used. 
        
        @param d_filename The input file name from which the data is read
        @return A dictionary whose keys are the two category names joined by a
        separator and whose values are floats with the distance between them.
    """
    try:
        d_data = open(d_filename)

        distances_map = {}  # Use a map to remove duplicates if present
        for line in d_data.readlines():
            if len(line.strip()) > 0:  # Ignore blank lines
                try:
                    cat1, cat2, distance = line.strip().split("\t")
                    key = "###SEPARATOR###".join(sorted([cat1, cat2]))
                    distances_map[key] = float(distance)
                except ValueError:
                    error("ERROR reading distances, expected three values "
                          "separated by TAB, found:\n" + line)
        return distances_map
    except IOError:
        error("\nERROR: Distance file \"%s\" not found" % d_filename)
Example #13
def treat_options( opts, arg, n_arg, usage_string ) :
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopts. Ignored.
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.    
    """
    global algoname
    global lower_attr
    global input_filetype_ext
    global output_filetype_ext

    treat_options_simplest( opts, arg, n_arg, usage_string )        

    for ( o, a ) in opts:
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o in ("-l","--lemmas" ) :
            lower_attr = "lemma"
        elif o in ("-a", "--algorithm"):
            algoname = a.lower()
        elif o in ("-m", "-x"):
        	error( "Deprecated options -x and -m. Run with -h for details" )
        else:
            raise Exception("Bad arg: " + o)
Example #14
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.

        @param opts The options parsed by getopts. Ignored.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.
    """
    global web_freq

    treat_options_simplest(opts, arg, n_arg, usage_string)

    mode = []
    for ( o, a ) in opts:
        if o in ( "-y", "--yahoo" ):
            web_freq = YahooFreq()
            mode.append("yahoo")
        elif o in ( "-w", "--google" ):
            web_freq = GoogleFreq()
            mode.append("google")

    if len(mode) > 1:
        error("At most one option -y or -w, should be provided")
Example #15
def treat_options( opts, arg, n_arg, usage_string ) :
    """Callback function that handles the command line options of this script.
    @param opts The options parsed by getopts. Ignored.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.    
    """
    global reference_fname
    global mwe_evaluator
    global corpus_filetype_ext
    global reference_filetype_ext

    sentence_aligner_class = NaiveSentenceAligner
    mwe_evaluator_class = ExactMatchMWEEvaluator

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-r", "--reference"):
            reference_fname = a
        elif o in ("--sentence-aligner"):
            sentence_aligner_class = SENTENCE_ALIGNERS[a]
        elif o in ("-e", "--evaluator"):
            mwe_evaluator_class = MWE_EVALUATORS[a]
        elif o == "--corpus-from":
            corpus_filetype_ext = a
        elif o == "--reference-from":
            reference_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if not reference_fname:
        error("No reference file given!")

    sentence_aligner = sentence_aligner_class()
    mwe_evaluator = mwe_evaluator_class(sentence_aligner)
Example #16
def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.
    @param opts The options parsed by getopts. Ignored.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.    
    """
    global limit
    global entity_buffer
    global input_filetype_ext
    global output_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for ( o, a ) in opts:
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o in ("-n", "--number"):
            try:
                limit = int(a)
                entity_buffer = [None] * limit
                if limit < 0:
                    raise ValueError
            except ValueError:
                error("You must provide a positive " + \
                      "integer value as argument of -n option.")
        else:
            raise Exception("Bad arg: " + o)
Example #17
    def handle_candidate(self, candidate, info={}):
        """For each candidate, stores it in a temporary Database (so that it can be
        retrieved later) and also creates a tuple containing the sorting key
        feature values and the candidate ID. All the tuples are stored in a
        global list, that will be sorted once all candidates are read and stored
        into the temporary DB.

        @param candidate The `Candidate` that is being read from the XML file.
        """
        global feat_list, all_feats, feat_list_ok, feat_to_order
        # First, verifies if all the features defined as sorting keys are real
        # features, by matching them against the meta-features of the header. This
        # is only performed once, before the first candidate is processed
        if not feat_list_ok:
            for feat_name in feat_list:
                if feat_name not in all_feats:
                    error("%(feat)s is not a valid feature\n" + \
                          "Please chose features from the list below\n" + \
                          "%(list)s" % {"feat": feat_name,
                                        "list": "\n".join(
                                            map(lambda x: "* " + x, all_feats))})
            feat_list_ok = True

        for tp_class in candidate.tpclasses:
            for feat_name in feat_list:
                feat_value = candidate.get_feat_value(feat_name)
                tp_value = candidate.get_tpclass_value(tp_class.name)
                if feat_value != UNKNOWN_FEAT_VALUE and \
                   tp_value != UNKNOWN_FEAT_VALUE :
                    pair = (float(feat_value), tp_value == "True")
                    feat_to_order[tp_class.name][feat_name].append(pair)
Example #18
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.

        @param opts The options parsed by getopts. Ignored.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.
    """
    global measures
    global supported_measures
    global main_freq_name
    global join_all_contrastive

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-m", "--measures"):
            try:
                measures = []
                measures = interpret_measures(a)
            except ValueError as message:
                error(
                    str(message) + "\nargument must be list separated by "
                    "\":\" and containing the names: " +
                    str(supported_measures))
        elif o in ("-o", "--original"):
            main_freq_name = a
        elif o in ("-a", "--all"):
            join_all_contrastive = True

    if not main_freq_name:
        error("Option -o is mandatory")
Example #19
def treat_options( opts, arg, n_arg, usage_string ) :
    """Callback function that handles the command line options of this script.
    @param opts The options parsed by getopts. Ignored.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.
    """
    global limit
    global input_filetype_ext
    global output_filetype_ext
    
    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o in ("-n", "--number"):
            try:
                limit = int( a )
                if limit < 0:
                    raise ValueError
            except ValueError:
                error("You must provide a positive " \
                         "integer value as argument of -n option.")
        else:
            raise Exception("Bad arg")
Example #20
 def feat_value(self, entity, feat_name):
     r"""Return value for given feature name."""
     if feat_name.startswith("@"):
         if feat_name == "@SURFACE":
             return tuple(w.surface for w in entity)
         if feat_name == "@LEMMA":
             return tuple(w.lemma for w in entity)
         if feat_name == "@POS":
             return tuple(w.pos for w in entity)
         error("Bad pseudo-feature name", feat_name=feat_name)
     return entity.get_feat_value(feat_name)
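
A minimal, self-contained illustration of the pseudo-feature dispatch above, using an invented stand-in for the toolkit's word objects (the real entity and word classes are richer):

from collections import namedtuple

# Hypothetical stand-in for the toolkit's word objects, for illustration only.
Word = namedtuple("Word", "surface lemma pos")
entity = [Word("cutting", "cut", "V"), Word("edges", "edge", "N")]

def pseudo_feat_value(entity, feat_name):
    """Mirror of the @-prefixed branch: build one tuple over the entity's words."""
    if feat_name == "@SURFACE":
        return tuple(w.surface for w in entity)
    if feat_name == "@LEMMA":
        return tuple(w.lemma for w in entity)
    if feat_name == "@POS":
        return tuple(w.pos for w in entity)
    raise ValueError("Bad pseudo-feature name: " + feat_name)

print(pseudo_feat_value(entity, "@LEMMA"))  # ('cut', 'edge')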
Example #21
 def feat_value(self, entity, feat_name):
     r"""Return value for given feature name."""
     if feat_name.startswith("@"):
         if feat_name == "@SURFACE":
             return tuple(w.surface for w in entity)
         if feat_name == "@LEMMA":
             return tuple(w.lemma for w in entity)
         if feat_name == "@POS":
             return tuple(w.pos for w in entity)
         error("Bad pseudo-feature name", feat_name=feat_name)
     return entity.get_feat_value(feat_name)
Example #22
 def __init__(self):
     global algoname
     if algoname == "simple" : 
         self.handle_sentence = self.handle_sentence_simple # Redundant, kept for clarity
     elif algoname == "complex" :
         self.handle_sentence = self.handle_sentence_complex
     elif algoname == "aggressive" :
         self.handle_sentence = self.handle_sentence_aggressive # Redundant, kept for clarity                
     else :
         error("%s is not a valid algorithm\nYou must provide a valid "+\
               "algorithm (e.g. \"complex\", \"simple\")." % algoname)
Example #23
def calculate_distances(distances_map, all_categories):
    """
        Generates a distances matrix from the distances map and the 
        correspondence between nominal categories and their IDs. This function
        is called just after reading the data when a distances file is provided.
        
        @param distances_map A dictionary where the keys are strings of the form
        category1###SEPARATOR###category2 and the values are the distances
        between category1 and category2
        @param all_categories A dictionary where the keys are the string nominal
        category names and the values are the integer unique IDs of each 
        category
        @return A symmetric matrix Nk x Nk, with the distance between categories
        represented in the cells. The rows and columns are indexed with the IDs
        from 0 to Nk-1, the matrix contains 0.0 in the main diagonal. The values
        not specified in the distances_map are set to the maximum distance seen
        in the map by default. If no distance file is provided (distance_map is
        empty) the distances between each two different categories are 1.0.
    """
    Nk = len(all_categories.keys())
    distances_matrix = []
    max_distance = 0.0
    for k in range(Nk):
        distances_matrix.append(Nk * [-1.0])
    for key, distance in distances_map.items():
        cats = key.split("###SEPARATOR###")
        try:
            k1, k2 = map(lambda x: all_categories[x], cats)
        except KeyError:
            error("Distance file incompatible with annotations\nDid not find "
                  "categories %s in the annotation data" % cats)
            return None
        if k1 == k2:
            warn(
                "defined distance for category and self for %s. Replacing by 0"
                % cats[0])
        distances_matrix[k1][k2] = distance
        distances_matrix[k2][k1] = distance
        if distance > max_distance:
            max_distance = distance
    if len(distances_map.keys()) == 0:
        max_distance = 1.0
    # Fill in the non-specified distances with the maximal value
    for k1 in range(Nk):
        distances_matrix[k1][k1] = 0.0  # Distance between categ and itself = 0
        for k2 in range(Nk):
            if distances_matrix[k1][k2] < 0.0:  # Not specified or negative
                distances_matrix[k1][k2] = max_distance
    return distances_matrix
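
As a quick worked example of the fill-in behaviour, a standalone rerun of the same logic on invented data: one declared distance of 0.5 between two of three categories, so the diagonal is zeroed and every unspecified pair receives the maximum distance seen (0.5):

all_categories = {"Idiomatic": 0, "Literal": 1, "Unsure": 2}
distances_map = {"Idiomatic###SEPARATOR###Literal": 0.5}

Nk = len(all_categories)
matrix = [[-1.0] * Nk for _ in range(Nk)]
max_distance = 0.0
for key, distance in distances_map.items():
    k1, k2 = (all_categories[c] for c in key.split("###SEPARATOR###"))
    matrix[k1][k2] = matrix[k2][k1] = distance
    max_distance = max(max_distance, distance)
if not distances_map:
    max_distance = 1.0
for k1 in range(Nk):
    matrix[k1][k1] = 0.0
    for k2 in range(Nk):
        if matrix[k1][k2] < 0.0:
            matrix[k1][k2] = max_distance
print(matrix)  # [[0.0, 0.5, 0.5], [0.5, 0.0, 0.5], [0.5, 0.5, 0.0]]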
Example #24
def calculate_distances( distances_map, all_categories ) :
    """
        Generates a distances matrix from the distances map and the 
        correspondence between nominal categories and their IDs. This function
        is called just after reading the data when a distances file is provided.
        
        @param distances_map A dictionary where the keys are strings of the form
        category1###SEPARATOR###category2 and the values are the distances
        between category1 and category2
        @param all_categories A dictionary where the keys are the string nominal
        category names and the values are the integer unique IDs of each 
        category
        @return A symmetric matrix Nk x Nk, with the distance between categories
        represented in the cells. The rows and columns are indexed with the IDs
        from 0 to Nk-1, the matrix contains 0.0 in the main diagonal. The values
        not specified in the distances_map are set to the maximum distance seen
        in the map by default. If no distance file is provided (distance_map is
        empty) the distances between each two different categories are 1.0.
    """
    Nk = len( all_categories.keys() )
    distances_matrix = []
    max_distance = 0.0
    for k in range(Nk) :
        distances_matrix.append( Nk * [-1.0] )
    for key,distance in distances_map.items() :
        cats = key.split("###SEPARATOR###")
        try :
            k1,k2 = map(lambda x : all_categories[x], cats )
        except KeyError :
            error("Distance file incompatible with annotations\nDid not find "
                  "categories %s in the annotation data" % cats)
            return None
        if k1 == k2 :
            warn("defined distance for category and self for %s. Replacing by 0"
                 % cats[ 0 ])
        distances_matrix[ k1 ][ k2 ] = distance
        distances_matrix[ k2 ][ k1 ] = distance
        if distance > max_distance :
            max_distance = distance
    if len( distances_map.keys() ) == 0 :        
        max_distance = 1.0
    # Fill in the non-specified distances with the maximal value
    for k1 in range(Nk) :
        distances_matrix[k1][k1] = 0.0 # Distance between categ and itself = 0
        for k2 in range(Nk) :
            if distances_matrix[ k1 ][ k2 ] < 0.0 : # Not specified or negative
                distances_matrix[ k1 ][ k2 ] = max_distance     
    return distances_matrix
Example #25
def interpret_length(l, maxormin):
    """
    Transform the argument given to the -a or -i options into an integer, with error checking.

    @param l: A string passed as argument to -i or -a
    @param maxormin: A string indicating whether this is "maximum" or "minimum"
    @return: An integer corresponding to l
    """
    try:
        result = int(l)
        if result < 0:
            raise ValueError
        verbose("%s length: %d" % (maxormin, result))
        return result
    except ValueError:
        error("Argument of must be non-negative integer, got " + repr(l))
Example #26
def interpret_length( l, maxormin ):
    """
    Transform the argument given to the -a or -i options into an integer, with error checking.

    @param l: A string passed as argument to -i or -a
    @param maxormin: A string indicating whether this is "maximum" or "minimum"
    @return: An integer corresponding to l
    """
    try :
        result = int( l )
        if result < 0:
            raise ValueError
        verbose( "%s length: %d" % (maxormin, result) )
        return result
    except ValueError:
        error("Argument of must be non-negative integer, got " + repr(l))
Example #27
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.

        @param opts The options parsed by getopts. Ignored.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.
    """
    global pre_gs
    global ignore_pos
    global gs_name
    global ignore_case
    global lemma_or_surface
    global input_filetype_ext
    global reference_filetype_ext
    ref_name = None

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-r", "--reference"):
            ref_name = a
        elif o in ("-g", "--ignore-pos"):
            ignore_pos = True
        elif o in ("-c", "--case"):
            ignore_case = False
        elif o in ("-L", "--lemma-or-surface"):
            lemma_or_surface = True
        elif o == "--input-from":
            input_filetype_ext = a
        elif o == "--reference-from":
            reference_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    # The reference list needs to be opened after all the options are read,
    # since options such as -g and -c modify the way the list is represented
    if ref_name:
        filetype.parse([ref_name], ReferenceReaderHandler(),
                       reference_filetype_ext)
        gs_name = re.sub(".*/", "", re.sub("\.xml", "", ref_name))
    # There's no reference list... Oh oh cannot evaluate :-(
    if not pre_gs:
        error("You MUST provide a non-empty reference list!")
Example #28
def create_patterns_file( ngram_range ) :
    """
        Create an artificial list of MWE patterns in which all the parts of
        the words are wildcards. Such artificial patterns match every ngram
        of size n, which is exactly what we want to do with the option -n. This
        may seem a weird way to extract ngrams, but it allows a single 
        transparent candidate extraction function, treat_sentence.
        
        @param ngram_range String argument of the -n option.
    """        
    global patterns, usage_string, shortest_pattern, longest_pattern
    result = interpret_ngram( ngram_range )
    if result :
        ( shortest_pattern, longest_pattern ) = result
        patterns.append(build_generic_pattern(shortest_pattern, longest_pattern))
    else :
        error("Invalid argument for -n.")
Example #29
def treat_options( opts, arg, n_arg, usage_string ) :
    """
        Callback function that handles the command line options of this script.

        @param opts The options parsed by getopts. Ignored.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.
    """
    global pre_gs
    global ignore_pos
    global gs_name
    global ignore_case
    global lemma_or_surface
    global input_filetype_ext
    global reference_filetype_ext
    ref_name = None
    
    treat_options_simplest( opts, arg, n_arg, usage_string )    
    
    for ( o, a ) in opts:
        if o in ("-r", "--reference"):
            ref_name = a
        elif o in ("-g", "--ignore-pos"):
            ignore_pos = True
        elif o in ("-c", "--case"):
            ignore_case = False
        elif o in ("-L", "--lemma-or-surface"):
            lemma_or_surface = True
        elif o == "--input-from":
            input_filetype_ext = a
        elif o == "--reference-from":
            reference_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)
            
    # The reference list needs to be opened after all the options are read,
    # since options such as -g and -c modify the way the list is represented
    if ref_name :
        filetype.parse([ref_name], ReferenceReaderHandler(), reference_filetype_ext)
        gs_name = re.sub( ".*/", "", re.sub( "\.xml", "", ref_name ) )
    # There's no reference list... Oh oh cannot evaluate :-(
    if not pre_gs :
        error("You MUST provide a non-empty reference list!")
Example #30
def create_patterns_file(ngram_range):
    """
        Create an artificial list of MWE patterns in which all the parts of
        the words are wildcards. Such artificial patterns match every ngram
        of size n, which is exactly what we want to do with the option -n. This
        may seem a weird way to extract ngrams, but it allows a single 
        transparent candidate extraction function, treat_sentence.
        
        @param ngram_range String argument of the -n option.
    """
    global patterns, usage_string, shortest_pattern, longest_pattern
    result = interpret_ngram(ngram_range)
    if result:
        (shortest_pattern, longest_pattern) = result
        patterns.append(
            build_generic_pattern(shortest_pattern, longest_pattern))
    else:
        error("Invalid argument for -n.")
Example #31
 def handle_meta(self, meta, info={}) :
     """
         Reads the `corpus_size` meta header and initializes a global counter
         dictionary with zero for each corpus. This dict will contain the total
         number of candidate frequencies summed up, as in the csmwe original
         formulation.
     
         @param meta The `Meta` header that is being read from the XML file.          
     """
     global totals_dict, main_freq_name
     main_freq_valid = False    
     for corpus_size in meta.corpus_sizes :
         totals_dict[ corpus_size.name ] = 0
         if corpus_size.name == main_freq_name :
             main_freq_valid = True    
     if not main_freq_valid :
         error("main frequency must be a valid freq. name\nPossible values: " +
               str( totals_dict.keys() ))
Example #32
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopts. Ignored.
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.    
    """
    global surface_instead_lemmas
    global glue
    global base_attr
    global min_ngram
    global max_ngram
    global min_frequency
    global ngram_counts
    global selected_candidates
    global use_shelve
    global input_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    mode = []
    for (o, a) in opts:
        if o in ("-s", "--surface"):
            surface_instead_lemmas = True
            base_attr = 'surface'
        elif o in ("-f", "--freq"):
            min_frequency = int(a)
        elif o in ("-n", "--ngram"):
            (min_ngram, max_ngram) = interpret_ngram(a)
        elif o in ("-G", "--glue"):
            if a == "scp":
                glue = scp_glue
            else:
                error("Unknown glue function '%s'" % a)
        elif o in ("-S", "--shelve"):
            use_shelve = True
        elif o == "--from":
            input_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)
Example #33
 def handle_meta(self, meta, info={}):
     """
         Reads the `corpus_size` meta header and initializes a global counter
         dictionary with zero for each corpus. This dict will contain the total
         number of candidate frequencies summed up, as in the csmwe original
         formulation.
     
         @param meta The `Meta` header that is being read from the XML file.          
     """
     global totals_dict, main_freq_name
     main_freq_valid = False
     for corpus_size in meta.corpus_sizes:
         totals_dict[corpus_size.name] = 0
         if corpus_size.name == main_freq_name:
             main_freq_valid = True
     if not main_freq_valid:
         error(
             "main frequency must be a valid freq. name\nPossible values: "
             + str(totals_dict.keys()))
Example #34
def treat_options( opts, arg, n_arg, usage_string ) :
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopts. Ignored.
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.    
    """
    global surface_instead_lemmas
    global glue
    global base_attr
    global min_ngram
    global max_ngram
    global min_frequency
    global ngram_counts
    global selected_candidates
    global use_shelve
    global input_filetype_ext

    treat_options_simplest( opts, arg, n_arg, usage_string )

    mode = []
    for ( o, a ) in opts:
        if o in ("-s", "--surface") : 
            surface_instead_lemmas = True
            base_attr = 'surface'
        elif o in ("-f", "--freq") :
            min_frequency = int(a)
        elif o in ("-n", "--ngram") :
            (min_ngram, max_ngram) = interpret_ngram(a)
        elif o in ("-G", "--glue"):
            if a == "scp":
                glue = scp_glue
            else:
                error("Unknown glue function '%s'" % a)
        elif o in ("-S", "--shelve"):
            use_shelve = True
        elif o == "--from":
            input_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)
Example #35
def treat_options( opts, arg, n_arg, usage_string ) :
    """Callback function that handles the command line options of this script.
    @param opts The options parsed by getopts. Ignored.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.
    """
    global limit
    
    treat_options_simplest( opts, arg, n_arg, usage_string )    
    
    for ( o, a ) in opts:
        if o in ("-n", "--number") :
            try :
                limit = int( a )
                if limit < 0 :
                    raise ValueError
            except ValueError :
                error("You must provide a positive integer value as argument "
                      "of -n option.")
Example #36
    def handle_candidate(self, candidate, info={}):
        """For each candidate and for each `CorpusSize` read from the `Meta` 
        header, generates four features that correspond to the Association
        Measures described above.
        
        @param candidate The `Candidate` that is being read from the XML file.    
        """
        global corpussize_dict, main_freq

        joint_freq = {}
        singleword_freq = {}
        backed_off = False
        # Convert all these integers to floats...
        for freq in candidate.freqs:
            joint_freq[freq.name] = (float(abs(freq.value)))
            singleword_freq[freq.name] = []
            if freq.value < 0:
                backed_off = True
        for word in candidate:
            for freq in word.freqs:
                singleword_freq[freq.name].append(abs(float(freq.value)))
                # Little trick: negative counts indicate backed-off counts
                if freq.value < 0:
                    backed_off = True

        for freq in candidate.freqs:
            corpus_name = freq.name
            if not backed_off and corpus_name == "backoff":
                N = corpussize_dict[main_freq]
            else:
                N = corpussize_dict[corpus_name]
            try:
                feats = calculate_ams(joint_freq[corpus_name],
                                      singleword_freq[corpus_name], N,
                                      corpus_name)
                for feat in feats:
                    candidate.add_feat(feat)
            except Exception:
                error(
                    "This should never be printed. The end of the world is here"
                )

        self.chain.handle_candidate(candidate, info)
Example #37
def treat_options(opts, arg, n_arg, usage_string):
    """Callback function that handles the command line options of this script.
    @param opts The options parsed by getopts. Ignored.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.
    """
    global limit

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-n", "--number"):
            try:
                limit = int(a)
                if limit < 0:
                    raise ValueError
            except ValueError:
                error("You must provide a positive integer value as argument "
                      "of -n option.")
Example #38
def treat_options( opts, arg, n_arg, usage_string ) :
    """Callback function that handles the command line options of this script.
    @param opts The options parsed by getopts. Ignored.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.
    """
    global input_patterns
    global input_filetype_ext
    global output_filetype_ext
    global match_distance
    global non_overlapping
    global id_order
    global annotate
    global only_the_matching_subpart

    util.treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o == "--input-from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o in ("-p", "--patterns"):
            input_patterns = filetype.parse_entities([a])
        elif o in ("-d", "--match-distance") : 
            match_distance = a
        elif o in ("-N", "--non-overlapping") : 
            non_overlapping = True
        elif o == "--id-order":
            id_order = a.split(":")
        elif o == "--annotate":
            annotate = True
        elif o == "--only-matching":
            only_the_matching_subpart = True
        else:
            raise Exception("Bad arg " + o)

    if input_patterns is None:
        util.error("No patterns provided. Option --patterns is mandatory!")

    if only_the_matching_subpart and annotate:
        util.warn("Switch --only-matching disables --annotate")
Example #39
def open_index(prefix):
    """
    Open the index files (valid index created by the `index.py` script). 
    @param prefix The string name of the index file.
    """
    global freq_name, the_corpus_size
    global index, suffix_array
    assert prefix.endswith(".info")
    prefix = prefix[:-len(".info")]
    try:
        verbose("Loading index files... this may take some time.")
        index = Index(prefix)
        index.load_metadata()
        freq_name = re.sub(".*/", "", prefix)
        #pdb.set_trace()
        the_corpus_size = index.metadata["corpus_size"]
    except (IOError, KeyError):
        error("Error opening the index.\nTry again with another index filename")
Example #40
def calculate_and_print(annotations, Ni, Nc, Nk, categ_names):
    """
        Given the set of annotations read from the files, calculate the
        agreement coefficients and print them in a nice way.
        
        @param annotations The list of annotations containing one row per item,
        one column per rater, and the nominal categories in the cells
        @param Ni The total number of items I in the data
        @param Nc The total number of raters C in the data        
        @param Nk The total number of categories K in the data
        @param categ_names The names of the categories used to annotate, sorted
        by their IDs.
    """
    global calculate_pairwise
    global calculate_confusion
    if Ni != 0 and Nc != 0 and Nk != 0:  # empty file
        if calculate_pairwise:
            pairwise_map = compute_pairwise_all(annotations, Ni, Nc, Nk)
            for pair in pairwise_map.keys():
                print("\nAgreement for pair " + pair)
                (a0, S, pi, kappa, wkappa) = pairwise_map[pair]
                print("ao = %f, S = %f, pi = %f, (Cohen's) kappa = %f, "
                      "weighted kappa = %f" % (a0, S, pi, kappa, wkappa))
            print_matrix_kappa(pairwise_map, Nc)
        print ( "\nNc = %(Nc)d raters\nNi = %(Ni)d items\nNk = %(Nk)d " + \
              "categories\nNc x Ni = %(j)d judgements" ) %\
              {"Nc": Nc, "Ni": Ni, "Nk": Nk, "j": Ni * Nc }
        coeffs = compute_multi(annotations, Ni, Nc, Nk)
        print("\nOverall agreement coefficients for all annotators:")
        print(
            "multi-ao = %f\nmulti-pi (Fleiss' kappa) = %f\nmulti-kappa = %f\n"
            % coeffs)
        coeffs_weighted = compute_weighted_multi(annotations, Ni, Nc, Nk)
        print("Weighted agreement coefficients for all annotators:")
        print("alpha = %f\nalpha-kappa = %f\n" % coeffs_weighted)
        if calculate_confusion:
            confusion, counters = compute_confusion(annotations, Nc)
            print_matrix_confusion(confusion, categ_names, counters, Ni, Nc,
                                   Nk)
    else:
        error("you probably provided an empty file")
Example #41
    def handle_candidate(self, candidate, info={}):
        """For each candidate and for each `CorpusSize` read from the `Meta` 
        header, generates four features that correspond to the Association
        Measures described above.
        
        @param candidate The `Candidate` that is being read from the XML file.    
        """
        global corpussize_dict, main_freq

        joint_freq = {}
        singleword_freq = {}
        backed_off = False
        # Convert all these integers to floats...
        for freq in candidate.freqs :
            joint_freq[ freq.name ] = ( float(abs( freq.value ) ) )
            singleword_freq[ freq.name ] = []
            if freq.value < 0 :
                backed_off = True
        for word in candidate :
            for freq in word.freqs :
                singleword_freq[ freq.name ].append( abs( float(freq.value) ) )
                # Little trick: negative counts indicate backed-off counts
                if freq.value < 0 :
                    backed_off = True
        
        for freq in candidate.freqs :
            corpus_name = freq.name
            if not backed_off and corpus_name == "backoff" :
                N = corpussize_dict[ main_freq ]
            else :
                N = corpussize_dict[ corpus_name ]
            try :
                feats = calculate_ams( joint_freq[ corpus_name ],
                        singleword_freq[ corpus_name ],
                        N, corpus_name )
                for feat in feats :
                    candidate.add_feat( feat )
            except Exception :
                error( "This should never be printed. The end of the world is here")

        self.chain.handle_candidate(candidate, info)
Example #42
def calculate_and_print( annotations, Ni, Nc, Nk, categ_names ) :
    """
        Given the set of annotations read from the files, calculate the
        agreement coefficients and print them in a nice way.
        
        @param annotations The list of annotations containing one row per item,
        one column per rater, and the nominal categories in the cells
        @param Ni The total number of items I in the data
        @param Nc The total number of raters C in the data        
        @param Nk The total number of categories K in the data
        @param categ_names The names of the categories used to annotate, sorted
        by their IDs.
    """
    global calculate_pairwise
    global calculate_confusion
    if Ni != 0 and Nc != 0 and Nk != 0 : # empty file
        if calculate_pairwise :
            pairwise_map = compute_pairwise_all( annotations, Ni, Nc, Nk)
            for pair in pairwise_map.keys() :
                print("\nAgreement for pair " + pair)
                (a0, S, pi, kappa, wkappa) = pairwise_map[pair]
                print("ao = %f, S = %f, pi = %f, (Cohen's) kappa = %f, "
                      "weighted kappa = %f" % (a0, S, pi, kappa, wkappa) )
            print_matrix_kappa( pairwise_map, Nc )
        print ( "\nNc = %(Nc)d raters\nNi = %(Ni)d items\nNk = %(Nk)d " + \
              "categories\nNc x Ni = %(j)d judgements" ) %\
              {"Nc": Nc, "Ni": Ni, "Nk": Nk, "j": Ni * Nc }
        coeffs = compute_multi( annotations, Ni, Nc, Nk )
        print("\nOverall agreement coefficients for all annotators:")
        print("multi-ao = %f\nmulti-pi (Fleiss' kappa) = %f\nmulti-kappa = %f\n"
              % coeffs)
        coeffs_weighted = compute_weighted_multi( annotations, Ni, Nc, Nk )
        print("Weighted agreement coefficients for all annotators:")
        print("alpha = %f\nalpha-kappa = %f\n" % coeffs_weighted)
        if calculate_confusion :
            confusion, counters = compute_confusion( annotations, Nc )
            print_matrix_confusion(confusion, categ_names, counters, Ni, Nc, Nk)
    else :
        error("you probably provided an empty file")
Example #43
def open_index(prefix):
    """
    Open the index files (valid index created by the `index.py` script). 
    @param prefix The string name of the index file.
    """
    global freq_name, the_corpus_size
    global index, suffix_array
    assert prefix.endswith(".info")
    prefix = prefix[:-len(".info")]
    try:
        verbose("Loading index files... this may take some time.")
        index = Index(prefix)
        index.load_metadata()
        freq_name = re.sub(".*/", "", prefix)
        #pdb.set_trace()
        the_corpus_size = index.metadata["corpus_size"]
    except (IOError, KeyError):
        error(
            "Error opening the index.\nTry again with another index filename")
Example #44
 def handle_candidate(self, candidate, info={}) :
     """
         For each candidate and for each `CorpusSize` read from the `Meta` 
         header, generates features that correspond to the Contrastive
         Measures described above.
         
         @param candidate The `Candidate` that is being read from the XML file.    
     """
     global corpussize_dict
     global totals_dict
     global main_freq_name
     # get the original corpus freq, store the others in contrastive corpus dict
     # We use plus one smoothing to avoid dealing with zero freqs    
     contrast_freqs = {}
     if join_all_contrastive :
         contrast_freqs[ "all" ] = 1
     main_freq = None
     for freq in candidate.freqs :
         if freq.name == main_freq_name :
             main_freq = float( freq.value ) + 1 
         elif join_all_contrastive :
             contrast_freqs[ "all" ] += float( freq.value )
         else :
             contrast_freqs[ freq.name ] = float( freq.value ) + 1
     
     for contrast_name in contrast_freqs.keys() :                    
         try :            
             feats = calculate_indiv( corpussize_dict[ main_freq_name ],
                                      corpussize_dict[ contrast_name ],
                                      main_freq, 
                                      contrast_freqs[ contrast_name ], 
                                      totals_dict[ contrast_name ],                                      
                                      contrast_name )
             for feat in feats :
                 candidate.add_feat( feat )
         except Exception :
             error("Error in calculating the measures.")
     self.chain.handle_candidate(candidate, info)
Example #45
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.

        @param opts The options parsed by getopts. Ignored.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.
    """
    global measures
    global supported_measures
    global main_freq
    global not_normalize_mle
    global input_filetype_ext
    global output_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-m", "--measures"):
            try:
                measures = interpret_measures(a)
            except ValueError as message:
                error(
                    str(message) + "\nargument must be list separated by "
                    "\":\" and containing the names: " +
                    str(supported_measures))
        elif o in ("-o", "--original"):
            main_freq = a
        elif o in ("-u", "--unnorm-mle"):
            not_normalize_mle = True
        elif o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)
Example #46
    def handle_candidate(self, candidate, info={}):
        """
            For each candidate and for each `CorpusSize` read from the `Meta` 
            header, generates features that correspond to the Contrastive
            Measures described above.
            
            @param candidate The `Candidate` that is being read from the XML file.    
        """
        global corpussize_dict
        global totals_dict
        global main_freq_name
        # get the original corpus freq, store the others in contrastive corpus dict
        # We use plus one smoothing to avoid dealing with zero freqs
        contrast_freqs = {}
        if join_all_contrastive:
            contrast_freqs["all"] = 1
        main_freq = None
        for freq in candidate.freqs:
            if freq.name == main_freq_name:
                main_freq = float(freq.value) + 1
            elif join_all_contrastive:
                contrast_freqs["all"] += float(freq.value)
            else:
                contrast_freqs[freq.name] = float(freq.value) + 1

        for contrast_name in contrast_freqs.keys():
            try:
                feats = calculate_indiv(corpussize_dict[main_freq_name],
                                        corpussize_dict[contrast_name],
                                        main_freq,
                                        contrast_freqs[contrast_name],
                                        totals_dict[contrast_name],
                                        contrast_name)
                for feat in feats:
                    candidate.add_feat(feat)
            except Exception as err:
                error("Error in calculating the measures: " + str(err))
        self.chain.handle_candidate(candidate, info)
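
As the comment in handle_candidate says, every frequency is read with plus-one smoothing so that no count passed to calculate_indiv is zero. The standalone sketch below replays only that collection step on made-up data; the Freq tuple and the corpus names are illustrative, not part of the toolkit.

from collections import namedtuple

Freq = namedtuple("Freq", "name value")   # toy stand-in for candidate.freqs entries

def collect_freqs(freqs, main_freq_name, join_all_contrastive=False):
    """Replays the plus-one-smoothed frequency collection from handle_candidate."""
    contrast_freqs = {"all": 1} if join_all_contrastive else {}
    main_freq = None
    for freq in freqs:
        if freq.name == main_freq_name:
            main_freq = float(freq.value) + 1
        elif join_all_contrastive:
            contrast_freqs["all"] += float(freq.value)
        else:
            contrast_freqs[freq.name] = float(freq.value) + 1
    return main_freq, contrast_freqs

# A candidate seen 4 times in the main corpus, 0 and 2 times in two others:
freqs = [Freq("main", 4), Freq("contrastA", 0), Freq("contrastB", 2)]
print(collect_freqs(freqs, "main"))
# (5.0, {'contrastA': 1.0, 'contrastB': 3.0})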
Exemple #47
0
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.

        @param opts The options parsed by getopts. Ignored.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.
    """
    global feat_list
    global ascending
    global print_precs
    global input_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    a_or_d = []
    for (o, a) in opts:
        if o in ("-f", "--feat"):
            feat_list = treat_feat_list(a)
        elif o in ("-a", "--asc"):
            ascending = True
            a_or_d.append("a")
        elif o in ("-d", "--desc"):
            ascending = False
            a_or_d.append("d")
        elif o in ("-p", "--precs"):
            print_precs = True
        elif o == "--from":
            input_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if len(a_or_d) > 1:
        warn("you should provide only one option, -a OR -d. Only the last one"+\
             " will be considered.")
    if not feat_list:
        error("You MUST provide at least one feature with -f")
Exemple #48
0
    def search_terms( self, in_text, query ) :
        """
            Queries the Yahoo term extraction web service for the terms found
            in `in_text` and returns the resulting list of terms.
        """
        if DEFAULT_LANG != "en" :
            print("WARNING: Yahoo terms only works for English", file=sys.stderr)
        input_text = in_text.strip()
        if isinstance( input_text, unicode ) :
            input_text = input_text.encode( 'utf-8' )
        try:
            url = ('http://search.yahooapis.com/ContentAnalysisService/'
                   'V1/termExtraction')
            post_data = urllib.urlencode( { "context": input_text,
                                            "appid": YAHOO_APPID,
                                            "query": query,
                                            "output": "json" } )
            request = urllib2.Request( url, post_data )
            response = urllib2.urlopen( request )
            results = simplejson.load( response )
            return results[ "ResultSet" ][ "Result" ]
        except Exception as err:
            error( "Got an error -> " + str(err) +
                   "\nPLEASE VERIFY YOUR INTERNET CONNECTION" )
Exemple #49
0
    def handle_meta(self, meta, info={}):
        """Treats the meta information of the file. Besides of printing the meta
        header out, it also keeps track of all the meta-features. The list of
        `all_feats` will be used in order to verify that all key features have a
        valid meta-feature. This is important because we need to determine the
        correct type of the feature value, since it might influence sorting
        order (e.g. integers 1 < 2 < 10 but strings "1" < "10" < "2")

        @param meta The `Meta` header that is being read from the XML file.
        """
        global all_feats, usage_string, feat_to_order
        for meta_feat in meta.meta_feats:
            if meta_feat.feat_type in ("integer", "real"):
                all_feats.append(meta_feat.name)
        tp_classes_ok = False
        for meta_tp in meta.meta_tpclasses:
            if meta_tp.feat_type == "{True,False}":
                tp_classes_ok = True
                feat_to_order[meta_tp.name] = {}
                for feat_name in all_feats:
                    feat_to_order[meta_tp.name][feat_name] = []
        if not tp_classes_ok:
            error("You must define a boolean TP class")
Exemple #51
0
def treat_options( opts, arg, n_arg, usage_string ) :
    """
        Callback function that handles the command line options of this script.

        @param opts The options parsed by getopts. Ignored.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.
    """
    global measures
    global supported_measures
    global main_freq
    global not_normalize_mle
    global input_filetype_ext
    global output_filetype_ext
    
    treat_options_simplest( opts, arg, n_arg, usage_string )
        
    for ( o, a ) in opts:
        if o in ( "-m", "--measures" ) :
            try :
                measures = interpret_measures( a )
            except ValueError as message :
                error( str(message) + "\nargument must be list separated by "
                                      "\":\" and containing the names: " +
                       str( supported_measures ))
        elif o in ( "-o", "--original" ) :
            main_freq = a
        elif o in ( "-u", "--unnorm-mle" ) :
            not_normalize_mle = True
        elif o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)
Exemple #52
0
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.

        @param opts The options parsed by getopts. Ignored.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.
    """
    global feat_list
    global ascending
    global print_precs
    global input_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    a_or_d = []
    for (o, a) in opts:
        if o in ("-f", "--feat"):
            feat_list = treat_feat_list(a)
        elif o in ("-a", "--asc"):
            ascending = True
            a_or_d.append("a")
        elif o in ("-d", "--desc"):
            ascending = False
            a_or_d.append("d")
        elif o in ("-p", "--precs"):
            print_precs = True
        elif o == "--from":
            input_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if len(a_or_d) > 1:
        warn("you should provide only one option, -a OR -d. Only the last one" + " will be considered.")
    if not feat_list:
        error("You MUST provide at least one feature with -f")
Exemple #53
0
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopts. Ignored.
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.    
    """
    global used_attributes
    global basename
    global build_entry
    global use_text_format
    global input_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    used_attributes = ["lemma", "pos", "surface", "syn"]
    for (o, a) in opts:
        if o in ("-i", "--index"):
            basename = a
        elif o == "--from":
            input_filetype_ext = a
        elif o in ("-a", "--attributes"):
            used_attributes = a.split(":")
        elif o in ("-m", "--moses"):
            use_text_format = "moses"
        elif o in ("-c", "--conll"):
            use_text_format = "conll"
        elif o in ("-o", "--old"):
            indexlib.Index.use_c_indexer(False)

    if basename is None:
        error("You must provide a filename for the index.\n"
              "Option -i is mandatory.")
Exemple #54
0
def treat_options_csv2xml( opts, arg, n_arg, usage_string ):
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopts. Ignored.
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.   
    """
    
    global SEPCHAR
    global SURFACE_FLAG
    
    for ( o , a ) in opts:
        if o == "-F":
            # sets a new separator character to be used when splitting a line
            SEPCHAR = a
        elif o == "-s":
            # assign each word form to the "surface" attribute
            # instead of the default "lemma" attribute
            SURFACE_FLAG = 1
        else:
            error("Option " + o + " is not a valid option")
Exemple #55
0
def treat_options_csv2xml(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopts. Ignored.
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.   
    """

    global SEPCHAR
    global SURFACE_FLAG

    for (o, a) in opts:
        if o == "-F":
            # sets a new separator character to be used when splitting a line
            SEPCHAR = a
        elif o == "-s":
            # assign each word form to the "surface" attribute
            # instead of the default "lemma" attribute
            SURFACE_FLAG = 1
        else:
            error("Option " + o + " is not a valid option")
Exemple #56
0
def treat_options( opts, arg, n_arg, usage_string ) :
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopts. Ignored.
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.    
    """
    global surface_instead_lemmas
    global glue
    global corpus_from_index
    global base_attr
    global min_ngram
    global max_ngram
    global min_frequency

    treat_options_simplest( opts, arg, n_arg, usage_string )

    mode = []
    for ( o, a ) in opts:
        if o in ("-s", "--surface") : 
            surface_instead_lemmas = True
            base_attr = 'surface'
        elif o in ("-f", "--freq") :
            min_frequency = int(a)
        elif o in ("-n", "--ngram") :
            (min_ngram, max_ngram) = interpret_ngram(a)
        elif o in ("-i", "--index") :
            corpus_from_index = True
        elif o in ("-G", "--glue"):
            if a == "scp":
                glue = scp_glue
            else:
                error("Unknown glue function '%s'" % a)
Exemple #57
0
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopts. Ignored.
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.    
    """
    global surface_instead_lemmas
    global glue
    global corpus_from_index
    global base_attr
    global min_ngram
    global max_ngram
    global min_frequency

    treat_options_simplest(opts, arg, n_arg, usage_string)

    mode = []
    for (o, a) in opts:
        if o in ("-s", "--surface"):
            surface_instead_lemmas = True
            base_attr = 'surface'
        elif o in ("-f", "--freq"):
            min_frequency = int(a)
        elif o in ("-n", "--ngram"):
            (min_ngram, max_ngram) = interpret_ngram(a)
        elif o in ("-i", "--index"):
            corpus_from_index = True
        elif o in ("-G", "--glue"):
            if a == "scp":
                glue = scp_glue
            else:
                error("Unknown glue function '%s'" % a)