from collections import defaultdict

# OntologyWriter, orderedTraversal, load_wn, find_equiv, and the
# glob/codecs/flags helpers used below are assumed to be imported elsewhere
# in this module.


def big_test(version="3.0", max_length=3):
    # Deferred imports: NLTK and the topicmod WordNet loader are only
    # needed for this test.
    from topicmod.util.wordnet import load_wn
    from nltk.corpus import brown
    from nltk.util import ingrams

    wn = load_wn(version)

    # Count Brown-corpus n-grams (unigrams up to max_length-grams) that
    # match a noun synset in WordNet.  The original loop ran over
    # xrange(max_length), i.e. lengths 0..max_length-1, which double-counts
    # unigrams and never reaches max_length; 1..max_length is intended.
    term_counts = defaultdict(int)
    for ngram_length in xrange(1, max_length + 1):
        token = 0
        for w in ingrams(brown.words(), ngram_length):
            token += 1
            normalized = "_".join(w).lower()
            if wn.synsets(normalized, 'n'):
                term_counts[wn.morphy(normalized)] += 1

    filename = "wn/wordnet.wn"
    if version != "3.0":
        filename = "wn/wordnet_%s.wn" % version

    # Write every synset in traversal order, with its hyponym children and
    # its lemmas (counts are add-one smoothed).
    o = OntologyWriter(filename)
    for ii in orderedTraversal(wn):
        o.AddSynset(ii.offset,
                    ii.name,
                    [x.offset for x in ii.hyponyms() + ii.instance_hyponyms()],
                    [(0, x.name.lower(), term_counts[x.name] + 1)
                     for x in ii.lemmas])
    o.Finalize()
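# Minimal usage sketch (assumptions: NLTK's Brown corpus is downloaded and a
# writable wn/ directory exists, since the output paths above are relative):
#
#     big_test()               # writes wn/wordnet.wn for WordNet 3.0
#     big_test(version="2.0")  # writes wn/wordnet_2.0.wn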
def read_german(self, directory="../../data/germanet/"):
    old_wn = load_wn("2.0")
    new_wn = load_wn("3.0")
    self._mapping["de"] = defaultdict(set)
    for ii in glob(directory + "/ILI*"):
        print "Reading mapping from ", ii
        for jj in codecs.open(ii, 'r', "latin-1"):
            fields = jj.split()
            word = fields[0]

            # Strip spurious "$" and "?" markers from the German word.
            if word.startswith("$"):
                print "Spurious symbol: %s" % word.encode("ascii", "ignore")
                word = word.replace("$", "")
            if word.startswith("?"):
                print "Spurious symbol: %s" % word.encode("ascii", "ignore")
                word = word.replace("?", "")

            # After the word and three metadata columns, the remaining
            # fields come in (link_type, eng_word, eng_sense, synset)
            # quadruples; reverse so pop() consumes them left to right.
            fields = fields[4:]
            fields.reverse()
            while fields:
                try:
                    link_type = fields.pop()
                    eng_word = fields.pop()
                    eng_sense = fields.pop()
                    synset = fields.pop()
                except IndexError:
                    print "Pop error:", jj.encode("ascii", 'ignore'), fields
                    break

                # Synset identifiers look like "ENG20-<offset>-<pos>"; fall
                # back to bare "<offset>-<pos>", and skip anything else.
                if synset.startswith("ENG20"):
                    vers, offset, pos = synset.split("-")
                    assert vers == "ENG20", \
                        "Wrong version of WordNet: %s" % vers
                else:
                    if "-" in synset:
                        offset, pos = synset.split("-")
                    else:
                        continue

                # Map the WordNet 2.0 synset to its 3.0 equivalent and
                # record the German word for it.
                new_synset = find_equiv(pos, eng_word, offset,
                                        old_wn, new_wn)
                if new_synset and link_type in flags.gn_valid_relations:
                    self._mapping["de"][new_synset.name].add(word.lower())
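# Illustrative ILI record shape, inferred from the parser above (hypothetical
# values, not copied from a real GermaNet ILI file):
#
#     Hund <meta> <meta> <meta> synonym dog 1 ENG20-02084071-n
#
# i.e. the German word, three skipped columns, then one or more
# (link_type, eng_word, eng_sense, synset) quadruples.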
def load_wn(self, location, version):
    # Replace the module-level wn with the requested WordNet version so
    # that later lookups use it.
    globals()["wn"] = load_wn(version, location)
print "Pop error:", jj.encode("ascii", 'ignore'), fields break if synset.startswith("ENG20"): vers, offset, pos = synset.split("-") assert vers == "ENG20", "Wrong version of WordNet: %s" % vers else: if "-" in synset: offset, pos = synset.split("-") else: continue new_synset = find_equiv(pos, eng_word, offset, old_wn, new_wn) if new_synset and link_type in flags.gn_valid_relations: self._mapping["de"][new_synset.name].add(word.lower()) # Load all the languages we have as a test if __name__ == "__main__": flags.InitFlags() #gn = GermaNet() mapping = MultilingMapping() mapping.read_german() wn = load_wn() print[list(mapping.related_words(x)) for x in wn.synsets("dog")]
        # (Tail of generateCons: one output line per constraint word,
        # listing its part-of-speech tags and sense indices, tab separated.
        # Indentation reconstructed; the fragment starts mid-function.)
        word_senses_count[word] = 0
        count_word += 1
        tmp = word
        for pos in multipaths[word]:
            tmp += '\t' + pos
            for index in multipaths[word][pos]:
                word_senses_count[word] += 1
                count_sense += 1
                tmp += '\t' + str(index)
        if word_senses_count[word] > 1:
            im_words += word + " "
        outfile.write(tmp + '\n')

    outfile.write("\nThe total number of cons words: "
                  + str(count_word) + "\n")
    outfile.write("\nThe total number of cons words senses: "
                  + str(count_sense) + "\n")
    outfile.write("\nInteresting words: " + im_words + "\n")
    outfile.close()


flags.define_string("vocab", None, "The input vocab")
flags.define_string("output", None, "The output constraint file")
flags.define_int("num_cons", 0, "The number of constraints we want")

if __name__ == "__main__":
    flags.InitFlags()
    wordnet_path = "../../../data/wordnet/"
    eng_wn = load_wn("3.0", wordnet_path, "wn")
    vocab = readVocab(flags.vocab)
    generateCons(vocab, eng_wn, flags.output, flags.num_cons)
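# Sketch of the resulting constraint file, inferred from the writes above
# (hypothetical words, tags, and sense indices):
#
#     bank    n    0    1    8
#     run     n    3    v    2
#
#     The total number of cons words: 2
#     The total number of cons words senses: 5
#     Interesting words: bank run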