Example #1
0
 def before_file(self, fileobj, info={}):
     """
         Creates the output chain the first time it is needed and sends it
         the header of the candidates list. Nothing is done when the chain
         already exists.

         @param fileobj The file object, forwarded to the chain's own
         `before_file`.

         @param info Extra information, given to `make_printer` and forwarded
         to the chain's own `before_file`.
     """
     if self.chain is not None:
         # The chain exists already, so there is nothing left to set up
         return

     self.chain = self.make_printer(info, None)
     self.chain.before_file(fileobj, info)

     # Header announcing the size of the corpus and the real-valued "glue"
     # feature
     header = Meta(None, None, None)
     size_entry = CorpusSize("corpus", corpus_size)
     header.add_corpus_size(size_entry)
     glue_entry = MetaFeat("glue", "real")
     header.add_meta_feat(glue_entry)
     self.chain.handle_meta(header)
Example #2
0
 def handle_meta(self, meta, info={}) :
     """
         Adds four new meta-features corresponding to the features that we add
         to each candidate, then forwards the header to the chained handler.
         The meta-features define the type of the features: `pos_pattern` is
         an enumeration of all the keys of `all_patterns`, `n` is an integer
         number for the size n of the candidate, and `capitalized` and
         `hyphen` are fixed enumerations.
         
         @param meta The `Meta` header that is being read from the XML file.
         
         @param info Extra information, forwarded unchanged to the chain.
     """
     # Joining the keys gives a well-formed "{}" when there is no pattern at
     # all, whereas trimming the last character of a hand-built string left
     # a lone "}" in that case
     pattern_feat_values = "{" + ",".join( all_patterns.keys() ) + "}"
     meta.add_meta_feat( MetaFeat( "pos_pattern", pattern_feat_values ) ) 
     meta.add_meta_feat( MetaFeat( "n", "integer" ) )
     meta.add_meta_feat( MetaFeat( "capitalized", "{UPPERCASE,Firstupper,lowercase,MiXeD}" ) )    
     meta.add_meta_feat( MetaFeat( "hyphen", "{True,False}" ) )        
     self.chain.handle_meta(meta, info)
Example #3
0
 def handle_meta(self, meta, info={}):
     """
         Adds one new meta-feature per corpus size found in the header,
         corresponding to the entropy feature that we add to each candidate.
         Each meta-feature is named `entropy_` followed by the name of the
         corpus and has type real. The header is then forwarded to the
         chained handler.
         
         @param meta The `Meta` header that is being read from the XML file.
         
         @param info Extra information, forwarded unchanged to the chain.
     """
     for corpus_size in meta.corpus_sizes :
         meta.add_meta_feat( MetaFeat( "entropy_" + corpus_size.name, "real" ) )
     self.chain.handle_meta(meta, info)
Example #4
0
 def handle_meta(self, meta, info={}):
     """Declares the association measure (AM) meta-features in the header.

     The size of every corpus is first stored as a float in
     `corpussize_dict`, under the name of the corpus. Then one real-valued
     meta-feature named `<measure>_<corpus>` is added for each entry of
     `measures` in each corpus, and the header is forwarded to the chain.

     @param meta The `Meta` header that is being read from the XML file.

     @param info Extra information, forwarded unchanged to the chain.
     """
     global corpussize_dict, measures

     # First pass: remember how large each corpus is
     for size_entry in meta.corpus_sizes:
         corpus_name = size_entry.name
         corpussize_dict[corpus_name] = float(size_entry.value)

     # Second pass: one meta-feature per (corpus, measure) pair
     for size_entry in meta.corpus_sizes:
         for measure_name in measures:
             feat_name = measure_name + "_" + size_entry.name
             meta.add_meta_feat(MetaFeat(feat_name, "real"))

     self.chain.handle_meta(meta, info)
Example #5
0
def main():
    """
        Main function. Feeds every sentence of the corpus to
        `treat_sentence`, reading the corpus either from an index or from an
        XML file, then runs `localmaxs()` and prints the selected ngrams as a
        candidates XML document (header, meta section, one entry per ngram
        and footer).
    """
    global corpus_size_f

    if corpus_from_index:
        index = Index(corpus_path)
        index.load_main()
        for sentence in index.iterate_sentences():
            treat_sentence(sentence)
    else:
        # The context manager closes the corpus file even when the parser
        # raises, which the explicit close() call did not guarantee
        with open(corpus_path) as input_file:
            parser = xml.sax.make_parser()
            parser.setContentHandler(CorpusXMLHandler(treat_sentence))
            parser.parse(input_file)

    # NOTE(review): corpus_size is presumably updated while the sentences
    # are treated -- confirm against treat_sentence
    corpus_size_f = float(corpus_size)

    localmaxs()

    verbose("Outputting candidates file...")
    print(XML_HEADER % {"category": "candidates", "ns": ""})

    meta = Meta([CorpusSize("corpus", corpus_size)],
                [MetaFeat("glue", "real")], [])
    print(meta.to_xml().encode('utf-8'))

    # Sequential identifier of the dumped candidates; deliberately not
    # called `id`, which would shadow the builtin
    candidate_id = 0

    for ngram in select:
        if (min_ngram <= len(ngram) <= max_ngram
                and select[ngram] and ngram_counts[ngram] >= min_frequency):
            dump_ngram(ngram, candidate_id)
            candidate_id += 1

    print(XML_FOOTER % {"category": "candidates"})
Example #6
0
def getMeta(filename):
    """
        Generates the <meta> section of the .xml file and prints it. A line
        that does not have the same number of columns as the header is
        reported through `error`.
        
        The type of each feature is inferred from the values of its column:
        "int" or "float" for numeric values, an enumeration of the values
        seen while there are at most THRESHOLD of them, "string" otherwise.
        
        @param filename String of a file's name to be processed.
    """

    global corpora
    global features
    global tpclasses

    # maps a feature (name) to its proper type: "int", "float", "string" or
    # the set of values seen so far (enumeration in progress)
    featType = dict((feature, set()) for feature in features)

    # maps a tpclass (name) to a set of types
    tpclassType = dict((tpclass, set()) for tpclass in tpclasses)

    # the context manager guarantees that the file is closed, including when
    # an exception is raised while reading it
    with open(filename, "r") as f:
        # get the file header (special characters escaped), so we can start
        # processing
        header = strip_xml(f.readline()).strip("\n").split(SEPCHAR)

        # get the features' and the tpclasses' types
        for lineCounter, row in enumerate(f, 1):
            # escapes special characters
            line = strip_xml(row).strip("\n").split(SEPCHAR)

            if len(line) != len(header):
                error("the number of columns in line " + str(lineCounter) +
                      " and header is different")
            for feature in features:
                # get feature value
                feat = line[indexes[feature]]
                if isInt(feat):
                    featType[feature] = "int"
                elif isFloat(feat):
                    featType[feature] = "float"
                else:
                    seen = featType[feature]
                    if isinstance(seen, set):
                        # while the threshold is not reached, the feature
                        # type is a list of elements
                        seen.add(feat)
                        # threshold reached, feature type is assigned to
                        # string
                        if len(seen) > THRESHOLD:
                            featType[feature] = "string"
                    else:
                        # a non-numeric value in a column so far typed as
                        # "int" or "float" (or already "string"): calling
                        # add() on that type name used to raise
                        # AttributeError, fall back to free text instead
                        featType[feature] = "string"
            # get tpclass types
            for tpclass in tpclasses:
                tpclassType[tpclass].add(line[indexes[tpclass]])

    # create a Meta object to be printed in the end
    objectMeta = Meta([], [], [])

    # add corpus size data to Meta
    for corpus in corpora:
        objectCorpusSize = CorpusSize(str(corpus), str(DEFAULT_CORPUS_SIZE))
        objectMeta.add_corpus_size(objectCorpusSize)

    # creates a metafeat object to be added to the meta object
    for feature in features:
        featValue = featType[feature]
        if isinstance(featValue, set):
            # still an enumeration: serialise the set of values
            featValue = setToString(featValue)
        objectMetaFeat = MetaFeat(feature, featValue)
        objectMeta.add_meta_feat(objectMetaFeat)

    # creates a tpclass object to be added to the meta object
    for tpclass in tpclassType:
        tpclassName = tpclass.split("_")[1]
        objectMetaTPClass = MetaTPClass(tpclassName,
                                        setToString(tpclassType[tpclass]))
        # tpclasses have their own registration method; add_meta_feat would
        # file them together with the features
        objectMeta.add_meta_tpclass(objectMetaTPClass)

    # prints the meta object
    print(objectMeta.to_xml().encode('utf-8'))
    def startElement( self, name, attrs ) :
        """
            Handles the opening tags of a candidates XML file, replacing the
            default SAX callback that does nothing. Depending on the tag, an
            object is created (candidate, ngram, word, frequency, feature,
            tpclass, meta information) or a flag is raised to remember which
            section is being read.
            
            @param name The name of the opening element.
            
            @param attrs Dictionary containing the attributes of this element.
        """
        if name == "cand" :
            # Use the ID found in the file, or the next value of the
            # internal counter when the file gives none
            if "candid" not in attrs.keys() :
                id_number = self.id_number_counter
                self.id_number_counter += 1
            else :
                id_number = strip_xml( attrs[ "candid" ] )
            # The candidate starts empty and is filled until </cand>
            self.candidate = Candidate( id_number, None, [], [], [], [] )
        elif name == "ngram" :
            # The words of the ngram are not known yet, so it starts empty
            self.ngram = Ngram( [], [] )
        elif name == "bigrams" :
            self.inbigram = True
        elif name == "occurs" :
            self.inoccurs = True
        elif name == "vars" :
            self.invars = True
        elif name == "w" :
            # An attribute that is absent is replaced by the wildcard, which
            # means "uninformed" for candidates or "any" for patterns
            surface = ( strip_xml( attrs[ "surface" ] )
                        if "surface" in attrs.keys() else WILDCARD )
            lemma = ( strip_xml( attrs[ "lemma" ] )
                      if "lemma" in attrs.keys() else WILDCARD )
            pos = ( strip_xml( attrs[ "pos" ] )
                    if "pos" in attrs.keys() else WILDCARD )
            self.word = Word( surface, lemma, pos, WILDCARD, [] )
            # The new word belongs to the ngram currently being read
            self.ngram.append( self.word )
        elif name == "freq" :
            freq_name = strip_xml( attrs[ "name" ] )
            freq_value = int( strip_xml( attrs[ "value" ] ) )
            self.freq = Frequency( freq_name, freq_value )
            # A <freq> inside a word element is the frequency of that word,
            # otherwise it is the frequency of the ngram being read
            owner = self.word if self.word else self.ngram
            owner.add_frequency( self.freq )

        elif name == "sources":
            source_ids = attrs["ids"].split(';')
            self.ngram.add_sources(source_ids)

        elif name == "feat" :
            feat_name = strip_xml( attrs[ "name" ] )
            raw_value = strip_xml( attrs[ "value" ] )
            feat_type = self.meta.get_feat_type( feat_name )
            # The value is converted according to the type declared in the
            # meta section; any other type keeps the text as it is
            if feat_type == "integer" :
                feat_value = int( raw_value )
            elif feat_type == "real" :
                feat_value = float( raw_value )
            else :
                feat_value = raw_value
            self.candidate.add_feat( Feature( feat_name, feat_value ) )
        elif name == "tpclass" :
            tp_name = strip_xml( attrs[ "name" ] )
            tp_value = strip_xml( attrs[ "value" ] )
            self.candidate.add_tpclass( TPClass( tp_name, tp_value ) )

        # The meta section holds meta-information about the candidates
        # list. It matters for generating features and converting to arff
        # files, and it must agree with the candidates themselves (e.g. a
        # meta-feature has the same name as the actual feature)
        elif name == "meta" :
            self.meta = Meta( [], [], [] )
        elif name == "corpussize" :
            self.meta.add_corpus_size(
                    CorpusSize( attrs[ "name" ], attrs[ "value" ] ) )
        elif name == "metafeat" :
            self.meta.add_meta_feat(
                    MetaFeat( attrs[ "name" ], attrs[ "type" ] ) )
        elif name == "metatpclass" :
            self.meta.add_meta_tpclass(
                    MetaTPClass( attrs[ "name" ], attrs[ "type" ] ) )
        elif name == "candidates" and self.gen_xml :
            header_fields = { "root" : self.gen_xml, "ns" : "" }
            print(XML_HEADER % header_fields)
Example #8
0
 def startElement(self, name, attrs):
     """
         Handles the opening tags of a dictionary XML file, replacing the
         default SAX callback that does nothing. Depending on the tag, an
         object is created (entry, word, frequency, feature, meta
         information) and attached to the element currently being read.
         
         @param name The name of the opening element.
         
         @param attrs Dictionary containing the attributes of this element.
     """
     if name == "entry":
         # Use the ID found in the file, or the next value of the internal
         # counter when the file gives none
         if "entryid" not in attrs.keys():
             id_number = self.id_number_counter
             self.id_number_counter += 1
         else:
             id_number = strip_xml(attrs["entryid"])
         # The entry starts empty and is filled until </entry>
         self.entry = Entry(id_number, [], [], [])
     elif name == "w":
         # An attribute that is absent is replaced by the wildcard
         surface = (strip_xml(attrs["surface"])
                    if "surface" in attrs.keys() else WILDCARD)
         lemma = (strip_xml(attrs["lemma"])
                  if "lemma" in attrs.keys() else WILDCARD)
         pos = (strip_xml(attrs["pos"])
                if "pos" in attrs.keys() else WILDCARD)
         syn = (strip_xml(attrs["syn"])
                if "syn" in attrs.keys() else WILDCARD)
         self.word = Word(surface, lemma, pos, syn, [])
         self.entry.append(self.word)
     elif name == "freq":
         freq_name = strip_xml(attrs["name"])
         freq_value = int(strip_xml(attrs["value"]))
         self.freq = Frequency(freq_name, freq_value)
         # A <freq> inside a word element is the frequency of that word,
         # otherwise it is the frequency of the entry being read
         owner = self.word if self.word else self.entry
         owner.add_frequency(self.freq)
     elif name == "feat":
         feat_name = strip_xml(attrs["name"])
         raw_value = strip_xml(attrs["value"])
         feat_type = self.meta.get_feat_type(feat_name)
         # The value is converted according to the type declared in the
         # meta section; any other type keeps the text as it is
         if feat_type == "integer":
             feat_value = int(raw_value)
         elif feat_type == "real":
             feat_value = float(raw_value)
         else:
             feat_value = raw_value
         self.entry.add_feat(Feature(feat_name, feat_value))
     # The meta section holds meta-information about the reference list.
     # It matters for generating features and converting to arff files,
     # and it must agree with the dictionary itself (e.g. a meta-feature
     # has the same name as the actual feature)
     elif name == "meta":
         self.meta = Meta([], [], [])
     elif name == "corpussize":
         self.meta.add_corpus_size(
             CorpusSize(attrs["name"], attrs["value"]))
     elif name == "metafeat":
         self.meta.add_meta_feat(
             MetaFeat(attrs["name"], attrs["type"]))
     elif name == "dict" and self.gen_xml:
         header_fields = {"root": self.gen_xml, "ns": ""}
         print(XML_HEADER % header_fields)