def __init__(self, db_dir):
    """Open all QDBM depot files of the index located under *db_dir*.

    Args:
        db_dir: Directory path where the database files are located.

    Every file is opened through ``depot.open`` with the ``"c"`` flag.
    NOTE(review): ``"c"`` is presumably "open for writing, create if
    missing" -- confirm against the qdbm Python binding.
    """
    # Directory path where the database files are located.
    self.db_dir = db_dir

    def open_db(filename):
        # All depots live in the same directory and share the same open flag.
        return depot.open(os.path.join(db_dir, filename), "c")

    # word -> word id
    self.db_word = open_db("Word.qdbm")
    # word id, document id -> TF
    self.db_worddoc_tf = open_db("WordDoc_TF.qdbm")
    # word id -> document id:TF,...
    self.db_word_doctf = open_db("Word_DocTF.qdbm")
    # document id -> word id:TF,...
    self.db_doc_wordtf = open_db("Doc_WordTF.qdbm")
    # word id -> word, DF, sum(TF), SF  (CSV format)
    self.db_word_stat = open_db("WordStat.qdbm")
    # word1 id, word2 id -> DF, SF
    self.db_wordpair_stat = open_db("WordPairStat.qdbm")
#!/usr/bin/python
# coding:utf-8
"""
2013/09/11
Record word statistics of a document corpus in QDBM.
"""
from qdbm import depot
import os

# word -> word id
db_word = depot.open("Word.qdbm", "n")
# db_tf = depot.open("TermFreq.db", "n")  # word id, document id -> TF
# word id -> word, DF, sum(TF), SF
db_word_stat = depot.open("WordStat.qdbm", "n")
# word1 id, word2 id -> DF, SF   (word1 id < word2 id)
db_pair_stat = depot.open("WordPairStat.qdbm", "n")


def index(doc):
    """Assign a word id to every distinct word appearing in *doc*.

    doc  : [sent, ...]
    sent : [word, ...]

    Words not yet present in ``db_word`` are registered with the next free
    id, which starts at the number of records ``db_word`` held on entry.
    The visible code returns nothing.
    """
    # Distinct words over all sentences of the document.
    word_set = set(word for sent in doc for word in sent)

    # Next id to hand out to a word not seen before.
    numWordRec = len(db_word)

    # word -> word id, for the words of this document.
    # NOTE(review): this mapping is built but not used or returned in the
    # code visible here -- the function may continue beyond this excerpt.
    wordid_dict = dict()
    for word in word_set:
        # Fetch the stored id, or register the word under the next free id.
        # NOTE(review): assumes the depot object offers dict-style
        # setdefault() and accepts an int value -- confirm against the
        # qdbm binding.
        wordid = db_word.setdefault(word, numWordRec)
        if wordid == numWordRec:
            # The candidate id was consumed by a newly registered word.
            numWordRec += 1
        wordid_dict[word] = wordid
from qdbm import depot db = depot.open("test.db", "n") # depot.open(ファイル名, フラグ, バケット配列の要素数の目安) db["apple"] = "red" # key, valueで登録 db["lemon"] = "black" db["orange"] = "orange" db["lemon"] = "yellow" # 上書き登録 print db["lemon"] # keyがlemonのvalueを取得 print db.get("orange", "unknown") # default付きで取得 print db.get("melon", "unknown") # default付きで取得 print db.keys() # keyの一覧を取得 for k in db.iterkeys(): # keyのiteratorでぐるぐるまわす print k for k, v in db.iteritems(): # (key, value)のiteratorでぐるぐるまわす print k, v for v in db.itervalues(): # valueのiteratorでぐるぐるまわす print v db.close() # 行儀よく終了 ''' depot.open(ファイル名, [フラグ, [バケット配列の要素数の目安]])