def before_file(self, fileobj, info={}):
    """Lazily builds the printer chain on the first file, forwards the
    file-open event, then emits a meta header declaring the corpus size
    and the real-valued "glue" feature.

    @param fileobj The file object that is about to be processed.
    @param info Extra information forwarded to the printer chain.
    """
    if self.chain is None:
        self.chain = self.make_printer(info, None)
    self.chain.before_file(fileobj, info)
    # Announce the corpus size and the "glue" feature up front so that
    # downstream handlers know the type of the feature we will attach.
    header = Meta(None, None, None)
    header.add_corpus_size(CorpusSize("corpus", corpus_size))
    header.add_meta_feat(MetaFeat("glue", "real"))
    self.chain.handle_meta(header)
def handle_meta(self, meta, info={}):
    """Adds new meta-features corresponding to the features that we add
    to each candidate. The meta-features define the type of the features,
    which is an enumeration of all possible POS patterns for the POS
    pattern and an integer number for the size n of the candidate.

    @param meta The `Meta` header that is being read from the XML file.
    @param info Extra information forwarded to the printer chain.
    """
    global all_patterns
    # Build the enumerated type "{p1,p2,...}" from every POS pattern seen
    # so far. str.join replaces the previous manual concatenate-then-slice
    # loop, which was quadratic and produced "}" (instead of "{}") when
    # all_patterns was empty.
    pattern_feat_values = "{" + ",".join(all_patterns.keys()) + "}"
    meta.add_meta_feat(MetaFeat("pos_pattern", pattern_feat_values))
    meta.add_meta_feat(MetaFeat("n", "integer"))
    meta.add_meta_feat(MetaFeat("capitalized",
                                "{UPPERCASE,Firstupper,lowercase,MiXeD}"))
    meta.add_meta_feat(MetaFeat("hyphen", "{True,False}"))
    self.chain.handle_meta(meta, info)
def handle_meta(self, meta, info={}):
    """Adds one new real-valued meta-feature per corpus, declaring the
    type of the "entropy_<corpus>" feature that we add to each candidate.

    @param meta The `Meta` header that is being read from the XML file.
    @param info Extra information forwarded to the printer chain.
    """
    # NOTE(review): removed dead code copy-pasted from the POS-pattern
    # handler (`global all_patterns` and an unused `pattern_feat_values`
    # initialization) — this handler only declares entropy features.
    for corpus_size in meta.corpus_sizes:
        meta.add_meta_feat(MetaFeat("entropy_" + corpus_size.name, "real"))
    self.chain.handle_meta(meta, info)
def handle_meta(self, meta, info={}):
    """Adds new meta-features corresponding to the AM features that we
    add to each candidate. The meta-features define the type of the
    features, which is a real number for each association measure in each
    corpus.

    @param meta The `Meta` header that is being read from the XML file.
    @param info Extra information forwarded to the printer chain.
    """
    global corpussize_dict
    global measures
    # First pass: remember every corpus size as a float for later use.
    for cs in meta.corpus_sizes:
        corpussize_dict[cs.name] = float(cs.value)
    # Second pass: declare one real-valued feature per (measure, corpus).
    for cs in meta.corpus_sizes:
        for measure_name in measures:
            meta.add_meta_feat(MetaFeat(measure_name + "_" + cs.name,
                                        "real"))
    self.chain.handle_meta(meta, info)
def main():
    """Main function. Reads the corpus (from an index or from an XML
    file), runs the LocalMaxs selection, then prints the selected n-grams
    as an XML candidates list.
    """
    global corpus_size_f
    if corpus_from_index:
        index = Index(corpus_path)
        index.load_main()
        for sentence in index.iterate_sentences():
            treat_sentence(sentence)
    else:
        # Context manager guarantees the corpus file is closed even if
        # SAX parsing raises.
        with open(corpus_path) as input_file:
            parser = xml.sax.make_parser()
            parser.setContentHandler(CorpusXMLHandler(treat_sentence))
            parser.parse(input_file)
    corpus_size_f = float(corpus_size)
    localmaxs()
    verbose("Outputting candidates file...")
    print(XML_HEADER % {"category": "candidates", "ns": ""})
    meta = Meta([CorpusSize("corpus", corpus_size)],
                [MetaFeat("glue", "real")], [])
    print(meta.to_xml().encode('utf-8'))
    cand_id = 0  # renamed from `id`, which shadowed the builtin
    for ngram in select:
        # Chained comparison replaces the two explicit bound checks.
        if (min_ngram <= len(ngram) <= max_ngram
                and select[ngram]
                and ngram_counts[ngram] >= min_frequency):
            dump_ngram(ngram, cand_id)
            cand_id += 1
    print(XML_FOOTER % {"category": "candidates"})
def getMeta(filename):
    """Generates the <meta> section of the .xml file. The process stops
    if the input file does not have the same number of columns of data
    for each line.

    @param filename String of a file's name to be processed.
    """
    global corpora
    global features
    global tpclasses
    # Context manager replaces the unclosed file handle of the original.
    with open(filename, "r") as f:
        # Get the file header, so we can start processing; escape special
        # characters first. (str.split replaces the Python-2-only
        # string.split module function.)
        line = strip_xml(f.readline())
        header = line.strip("\n").split(SEPCHAR)
        # Create a Meta object to be printed in the end.
        objectMeta = Meta([], [], [])
        # Add corpus size data to Meta.
        for corpus in corpora:
            objectMeta.add_corpus_size(
                CorpusSize(str(corpus), str(DEFAULT_CORPUS_SIZE)))
        # Maps a feature (name) to its inferred type: a set of observed
        # values while undecided, or one of "int"/"float"/"string".
        featType = dict([(feature, set()) for feature in features])
        # Maps a tpclass (name) to the set of values it takes.
        tpclassType = dict([(tpclass, set()) for tpclass in tpclasses])
        # Get the features' and the tpclasses' types.
        for lineCounter, row in enumerate(f, start=1):
            # Escapes special characters.
            line = strip_xml(row).strip("\n").split(SEPCHAR)
            if len(line) != len(header):
                error("the number of columns in line " + str(lineCounter) +
                      " and header is different")
            for feature in features:
                # Get feature value.
                feat = line[indexes[feature]]
                if isInt(feat):
                    featType[feature] = "int"
                elif isFloat(feat):
                    featType[feature] = "float"
                elif isinstance(featType[feature], set):
                    # While the threshold is not reached, the feature type
                    # is a list (set) of observed elements; once the
                    # threshold is reached, it degenerates to "string".
                    featType[feature].add(feat)
                    if len(featType[feature]) > THRESHOLD:
                        featType[feature] = "string"
                else:
                    # Column was numeric so far but this value is not:
                    # demote to "string". (The original called .add() on
                    # the str "int"/"float" here and crashed.)
                    featType[feature] = "string"
            # Get tpclass values.
            for tpclass in tpclasses:
                tpclassType[tpclass].add(line[indexes[tpclass]])
    # Creates a MetaFeat object per feature to be added to the meta object.
    for feature in features:
        if featType[feature] not in ["int", "float", "string"]:
            featType[feature] = setToString(featType[feature])
        objectMeta.add_meta_feat(MetaFeat(feature, featType[feature]))
    # Creates a MetaTPClass object per tpclass to be added to the meta
    # object. NOTE(review): tpclass names are assumed to contain "_"
    # (e.g. "tp_<name>") — .split("_")[1] raises IndexError otherwise.
    for tpclass in tpclassType:
        tpclassName = tpclass.split("_")[1]
        tpclassType[tpclass] = setToString(tpclassType[tpclass])
        objectMeta.add_meta_feat(MetaTPClass(tpclassName,
                                             tpclassType[tpclass]))
    # Prints the meta object.
    print(objectMeta.to_xml().encode('utf-8'))
def startElement( self, name, attrs ) :
    """ Treats starting tags in candidates XML file, overwrites
        default SAX dummy function. Dispatches on the element name and
        builds up the current Candidate/Ngram/Word on handler state.

        @param name The name of the opening element.

        @param attrs Dictionary containing the attributes of this
        element.
    """
    if name == "cand" :
        # Get the candidate ID or else create a new ID for it
        if "candid" in attrs.keys() :
            id_number = strip_xml( attrs[ "candid" ] )
        else :
            id_number = self.id_number_counter
            self.id_number_counter = self.id_number_counter + 1
        # Instanciates an empty mwe candidate that will be treated
        # when the <cand> tag is closed
        self.candidate = Candidate( id_number, None, [], [], [], [] )
    elif name == "ngram" :
        # Instanciates a new ngram. We do not know which words it
        # contains, so for the moment we just keep it on the stack
        self.ngram = Ngram( [], [] )
    elif name == "bigrams" :
        # Flag: subsequent ngrams belong to the <bigrams> section
        self.inbigram = True
    elif name == "occurs" :
        # Flag: subsequent ngrams belong to the <occurs> section
        self.inoccurs = True
    elif name == "vars" :
        # Flag: subsequent ngrams belong to the <vars> section
        self.invars = True
    elif name == "w" :
        # Instanciates a word. Missing attribute values are
        # assigned to a wildcard string, meaning "uninformed" for
        # candidates or "any" for patterns
        if "surface" in attrs.keys() :
            surface = strip_xml( attrs[ "surface" ] )
        else :
            surface = WILDCARD
        if "lemma" in attrs.keys() :
            lemma = strip_xml( attrs[ "lemma" ] )
        else :
            lemma = WILDCARD
        if "pos" in attrs.keys() :
            pos = strip_xml( attrs[ "pos" ] )
        else :
            pos = WILDCARD
        self.word = Word( surface, lemma, pos, WILDCARD, [] )
        # Add the word to the ngram that is on the stack
        self.ngram.append( self.word )
    elif name == "freq" :
        self.freq = Frequency( strip_xml( attrs[ "name" ] ),
                               int( strip_xml( attrs[ "value" ] ) ) )
        # If <freq> is inside a word element, then it's the word's
        # frequency, otherwise it corresponds to the frequency of
        # the ngram that is being read
        if self.word :
            self.word.add_frequency( self.freq )
        else :
            self.ngram.add_frequency( self.freq )
    elif name == "sources":
        # Semicolon-separated list of source sentence ids
        self.ngram.add_sources(attrs["ids"].split(';'))
    elif name == "feat" :
        feat_name = strip_xml( attrs[ "name" ] )
        feat_value = strip_xml( attrs[ "value" ] )
        # Convert the textual value according to the type declared in
        # the <meta> header (requires <meta> to precede <cand> elements)
        feat_type = self.meta.get_feat_type( feat_name )
        if feat_type == "integer" :
            feat_value = int( feat_value )
        elif feat_type == "real" :
            feat_value = float( feat_value )
        f = Feature( feat_name, feat_value )
        self.candidate.add_feat( f )
    elif name == "tpclass" :
        tp = TPClass( strip_xml( attrs[ "name" ] ),
                      strip_xml( attrs[ "value" ] ) )
        self.candidate.add_tpclass( tp )
    # Meta section and elements, correspond to meta-info about the
    # candidates lists. Meta-info are important for generating
    # features and converting to arff files, and must correspond
    # to the info in the candidates (e.g. meta-feature has the
    # same name as actual feature)
    elif name == "meta" :
        self.meta = Meta( [], [], [] )
    elif name == "corpussize" :
        cs = CorpusSize( attrs[ "name" ], attrs[ "value" ] )
        self.meta.add_corpus_size( cs )
    elif name == "metafeat" :
        mf = MetaFeat( attrs[ "name" ], attrs[ "type" ] )
        self.meta.add_meta_feat( mf )
    elif name == "metatpclass" :
        mtp = MetaTPClass( attrs[ "name" ], attrs[ "type" ] )
        self.meta.add_meta_tpclass( mtp )
    elif name == "candidates" and self.gen_xml :
        # Root element: echo an XML header when re-generating output
        print(XML_HEADER % { "root" : self.gen_xml, "ns" : "" })
def startElement(self, name, attrs):
    """ Treats starting tags in dictionary XML file, overwrites
        default SAX dummy function. Dispatches on the element name and
        builds up the current Entry/Word on handler state.

        @param name The name of the opening element.

        @param attrs Dictionary containing the attributes of this
        element.
    """
    if name == "entry":
        # Get the candidate ID or else create a new ID for it
        if "entryid" in attrs.keys():
            id_number = strip_xml(attrs["entryid"])
        else:
            id_number = self.id_number_counter
            self.id_number_counter = self.id_number_counter + 1
        # Instanciates an empty dict entry that will be treated
        # when the <entry> tag is closed
        self.entry = Entry(id_number, [], [], [])
    elif name == "w":
        # Missing word attributes default to the wildcard string
        if ("surface" in attrs.keys()):
            surface = strip_xml(attrs["surface"])
        else:
            surface = WILDCARD
        if ("lemma" in attrs.keys()):
            lemma = strip_xml(attrs["lemma"])
        else:
            lemma = WILDCARD
        if ("pos" in attrs.keys()):
            pos = strip_xml(attrs["pos"])
        else:
            pos = WILDCARD
        if ("syn" in attrs.keys()):
            syn = strip_xml(attrs["syn"])
        else:
            syn = WILDCARD
        self.word = Word(surface, lemma, pos, syn, [])
        self.entry.append(self.word)
    elif name == "freq":
        self.freq = Frequency(strip_xml(attrs["name"]),
                              int(strip_xml(attrs["value"])))
        # If <freq> is inside a word element, then it's the word's
        # frequency, otherwise it corresponds to the frequency of
        # the ngram that is being read
        if self.word:
            self.word.add_frequency(self.freq)
        else:
            self.entry.add_frequency(self.freq)
    elif name == "feat":
        feat_name = strip_xml(attrs["name"])
        feat_value = strip_xml(attrs["value"])
        # Convert the textual value according to the type declared in
        # the <meta> header (requires <meta> to precede <entry> elements)
        feat_type = self.meta.get_feat_type(feat_name)
        if feat_type == "integer":
            feat_value = int(feat_value)
        elif feat_type == "real":
            feat_value = float(feat_value)
        f = Feature(feat_name, feat_value)
        self.entry.add_feat(f)
    # Meta section and elements, correspond to meta-info about the
    # reference lists. Meta-info are important for generating
    # features and converting to arff files, and must correspond
    # to the info in the dictionary (e.g. meta-feature has the
    # same name as actual feature)
    elif name == "meta":
        self.meta = Meta([], [], [])
    elif name == "corpussize":
        cs = CorpusSize(attrs["name"], attrs["value"])
        self.meta.add_corpus_size(cs)
    elif name == "metafeat":
        mf = MetaFeat(attrs["name"], attrs["type"])
        self.meta.add_meta_feat(mf)
    elif name == "dict" and self.gen_xml:
        # Root element: echo an XML header when re-generating output
        print(XML_HEADER % {"root": self.gen_xml, "ns": ""})