def loadPickledFiles(): global g_wnWords, g_wnWordIndex, g_wnDictPath, g_wnIndexPath, g_wnWordsPath print "loading wn dictionary data files" if not arsutils.fFileExists(g_wnDictPath): print "WordNet dictionary file '%s' doesn't exist" % g_wnDictPath return False if not arsutils.fFileExists(g_wnIndexPath): print "WordNet dictionary file '%s' doesn't exist" % g_wnIndexPath return False if not arsutils.fFileExists(g_wnWordsPath): print "WordNet dictionary file '%s' doesn't exist" % g_wnWordsPath return False try: fo = open(g_wnIndexPath, "rb") g_wnWordIndex = cPickle.load(fo) fo.close() fo = open(g_wnWordsPath, "rb") g_wnWords = cPickle.load(fo) fo.close() except Exception, ex: print arsutils.exceptionAsStr(ex) return False
def doInfoMan(outDir): global proSynsets ensureDir(outDir) # synchronize those file names with dictionary.py DICT_FILE = "wn-dict.txt" INDEX_FILE = "wn-words-index.pic" WORDS_FILE = "wn-words.pic" dictPath = os.path.join(outDir, DICT_FILE) indexPath = os.path.join(outDir, INDEX_FILE) wordsPath = os.path.join(outDir, WORDS_FILE) if arsutils.fFileExists(dictPath) and arsutils.fFileExists( indexPath) and arsutils.fFileExists(wordsPath): print "All files already exist. Nothing to do" return for fileName in dataFiles: print "groking file: %s" % fileName grok_one_data_file(fileName) if len(unparsed) > 0: print "Unparsed:", len(unparsed) for i in unparsed: print i all_words = {} print "generating word defs" for synset in proSynsets: word_def = synset.getTxtMarked() for w in synset.words: if all_words.has_key(w): # TODO: some better sorting of this data? all_words[w] = all_words[w] + word_def else: all_words[w] = word_def print "sorting words" sortedWords = all_words.keys() sortedWords.sort() print "words: %d" % len(sortedWords) print "writing %s" % dictPath dictDataFo = open(dictPath, "wb") wordIndex = {} curOffset = 0 for word in sortedWords: wordDef = all_words[word] defLen = len(wordDef) wordIndex[word] = (curOffset, defLen) curOffset += defLen dictDataFo.write(wordDef) dictDataFo.close() print "pickling %s" % indexPath dictIndexFo = open(indexPath, "wb") cPickle.dump(wordIndex, dictIndexFo, protocol=cPickle.HIGHEST_PROTOCOL) dictIndexFo.close() print "pickling %s" % wordsPath dictWordsFo = open(wordsPath, "wb") cPickle.dump(sortedWords, dictWordsFo, protocol=cPickle.HIGHEST_PROTOCOL) dictWordsFo.close()
# NOTE(review): this line appears to be an orphaned, partially duplicated
# fragment of loadPickledFiles() -- it repeats the WordNet unpickling, has an
# 'except' with no enclosing 'try'/'def', and then begins loading thesaurus
# (g_th*) data but is cut off mid-'try'. It is not valid standalone Python.
# Confirm against the original source and remove or re-merge this chunk.
fo = open(g_wnIndexPath, "rb") g_wnWordIndex = cPickle.load(fo) fo.close() fo = open(g_wnWordsPath, "rb") g_wnWords = cPickle.load(fo) fo.close() except Exception, ex: print arsutils.exceptionAsStr(ex) return False print "Finished loading WordNet files" global g_thWords, g_thWordIndex, g_thDictPath, g_thIndexPath, g_thWordsPath print "loading th dictionary data files" if not arsutils.fFileExists(g_thDictPath): print "Thesaurus dictionary file '%s' doesn't exist" % g_thDictPath return False if not arsutils.fFileExists(g_thIndexPath): print "Thesaurus dictionary file '%s' doesn't exist" % g_thIndexPath return False if not arsutils.fFileExists(g_thWordsPath): print "Thesaurus dictionary file '%s' doesn't exist" % g_thWordsPath return False try: fo = open(g_thIndexPath, "rb") g_thWordIndex = cPickle.load(fo) fo.close()
def doInfoMan(outDir): global proSynsets ensureDir(outDir) # synchronize those file names with dictionary.py DICT_FILE = "wn-dict.txt" INDEX_FILE = "wn-words-index.pic" WORDS_FILE = "wn-words.pic" dictPath = os.path.join(outDir, DICT_FILE) indexPath = os.path.join(outDir, INDEX_FILE) wordsPath = os.path.join(outDir, WORDS_FILE) if arsutils.fFileExists(dictPath) and arsutils.fFileExists(indexPath) and arsutils.fFileExists(wordsPath): print "All files already exist. Nothing to do" return for fileName in dataFiles: print "groking file: %s" % fileName grok_one_data_file(fileName) if len(unparsed) > 0: print "Unparsed:", len(unparsed) for i in unparsed: print i all_words = {} print "generating word defs" for synset in proSynsets: word_def = synset.getTxtMarked() for w in synset.words: if all_words.has_key(w): # TODO: some better sorting of this data? all_words[w] = all_words[w] + word_def else: all_words[w] = word_def print "sorting words" sortedWords = all_words.keys() sortedWords.sort() print "words: %d" % len(sortedWords) print "writing %s" % dictPath dictDataFo = open(dictPath, "wb") wordIndex = {} curOffset = 0 for word in sortedWords: wordDef = all_words[word] defLen = len(wordDef) wordIndex[word] = (curOffset, defLen) curOffset += defLen dictDataFo.write(wordDef) dictDataFo.close() print "pickling %s" % indexPath dictIndexFo = open(indexPath, "wb") cPickle.dump(wordIndex, dictIndexFo, protocol=cPickle.HIGHEST_PROTOCOL) dictIndexFo.close() print "pickling %s" % wordsPath dictWordsFo = open(wordsPath, "wb") cPickle.dump(sortedWords, dictWordsFo, protocol=cPickle.HIGHEST_PROTOCOL) dictWordsFo.close()
def fCacheExists(sqlDumpFileName):
    """Return True if both cache files derived from sqlDumpFileName
    (the body text file and the index file) already exist on disk."""
    txtName = getBodyFileName(sqlDumpFileName)
    idxFileName = getIdxFileName(sqlDumpFileName)
    # Idiom fix: return the boolean condition directly instead of the
    # redundant 'if cond: return True / return False' pattern.
    return arsutils.fFileExists(txtName) and arsutils.fFileExists(idxFileName)