class Corpus():
    """On-disk text corpus rooted at corpus_dir.

    Layout created/opened by open():
        <root>/meta        - LevelDB holding corpus-wide metadata
                             (currently only "__sample_maxid__")
        <root>/samples     - per-sample data directory
        <root>/vocabulary  - backing store for Vocabulary
        <root>/categories  - backing store for Categories

    sample-id allocation is serialized through self.lock_meta so that
    concurrent writers never hand out overlapping id ranges.
    """

    # ---------------- __init__() ----------------
    def __init__(self, corpus_dir):
        # Guards the read-modify-write of "__sample_maxid__" in
        # acquire_sample_id().
        self.lock_meta = Lock()
        self.open(corpus_dir)

    # ---------------- __del__() ----------------
    def __del__(self):
        self.close()

    # ---------------- open_db_meta() ----------------
    def open_db_meta(self):
        """Open and return a LevelDB handle on the corpus meta directory.

        A fresh handle is opened per call; release it with close_db_meta().
        """
        logging.debug(Logger.debug("Corpus open_db_meta() %s" % (self.meta_dir) ))
        db_meta = leveldb.LevelDB(self.meta_dir)
        return db_meta

    # ---------------- close_db_meta() ----------------
    def close_db_meta(self, db_meta):
        """Release a handle returned by open_db_meta().

        py-leveldb exposes no explicit close; dropping the reference lets
        the DB be finalized, so this only clears the local binding.
        """
        db_meta = None

    def lock(self):
        """Acquire the corpus metadata lock (blocks)."""
        self.lock_meta.acquire()

    def unlock(self):
        """Release the corpus metadata lock."""
        self.lock_meta.release()

    # ---------------- open() ----------------
    def open(self, corpus_dir):
        """Create (if needed) and open the corpus directory tree,
        then load vocabulary and categories from their subdirectories.
        """
        self.root_dir = corpus_dir
        if not path.isdir(corpus_dir):
            os.mkdir(corpus_dir)
        self.meta_dir = self.root_dir + "/meta"
        self.samples_dir = self.root_dir + "/samples"
        if not path.isdir(self.samples_dir):
            os.mkdir(self.samples_dir)

        self.vocabulary_dir = self.root_dir + "/vocabulary"
        self.vocabulary = Vocabulary(self.vocabulary_dir)

        self.categories_dir = self.root_dir + "/categories"
        self.categories = Categories(self.categories_dir)
        self.categories.load_categories()
        self.categories.print_categories()

    # ---------------- close() ----------------
    def close(self):
        # Nothing to release explicitly; LevelDB handles are per-call and
        # Vocabulary/Categories manage their own lifetime.
        pass

    # ---------------- acquire_sample_id() ----------------
    # Thread-safely reserve num_samples sample ids (unique corpus-wide).
    def acquire_sample_id(self, num_samples):
        """Return the first id of a freshly reserved contiguous range of
        num_samples sample ids; the caller owns [id, id + num_samples).

        The lock is released in a finally block so that a failure inside
        the meta DB read/write cannot leave the corpus deadlocked.
        """
        self.lock()
        try:
            sample_id = self.get_sample_maxid()
            sample_maxid = sample_id + num_samples
            self.set_sample_maxid(sample_maxid)
        finally:
            self.unlock()
        return sample_id

    def get_sample_maxid(self):
        """Read the current max sample id from the meta DB.

        If the key is missing (new corpus) it is initialized to "0" and 0
        is returned. Not thread-safe by itself; callers serialize through
        lock()/unlock() (see acquire_sample_id()).
        """
        sample_maxid = 0
        db_meta = self.open_db_meta()
        try:
            str_maxid = db_meta.Get("__sample_maxid__")
            sample_maxid = int(str_maxid)
        except KeyError:
            db_meta.Put("__sample_maxid__", "0")
        self.close_db_meta(db_meta)
        return sample_maxid

    def set_sample_maxid(self, sample_maxid):
        """Persist sample_maxid into the meta DB (stored as a string)."""
        db_meta = self.open_db_meta()
        db_meta.Put("__sample_maxid__", str(sample_maxid))
        self.close_db_meta(db_meta)

    # ---------------- export_svm_file() ----------------
    def export_svm_file(self, samples_name, svm_file):
        """Export the tf-idf matrix of the named sample set as an SVM file."""
        samples = Samples(self, samples_name)
        logging.debug(Logger.debug("Export svm file..."))
        tm_tfidf = samples.load_tfidf_matrix()
        save_term_matrix_as_svm_file(tm_tfidf, svm_file)

    # ---------------- transform_sensitive_terms() ----------------
    def transform_sensitive_terms(self, sensitive_words, vocabulary):
        """Map {word: weight} to {term_id: weight} via the vocabulary.

        Returns an empty dict when sensitive_words is None.
        """
        sensitive_terms = {}
        if not sensitive_words is None:
            for word in sensitive_words:
                w = sensitive_words[word]
                term_id = vocabulary.get_term_id(word)
                sensitive_terms[term_id] = w
        return sensitive_terms

    # ---------------- query_by_id() ----------------
    def query_by_id(self, samples_positive, samples_unlabeled, sample_id):
        """Print a diagnostic report for one unlabeled sample: its metadata,
        segmented terms, and per-term positive-degree scores computed
        against the positive vs. unlabeled term-sample matrices.
        """
        tsm_positive = samples_positive.tsm
        tsm_unlabeled = samples_unlabeled.tsm

        # Example hand-weighted sensitive words (Chinese legal terms),
        # kept here commented out as a usage template.
        sensitive_words = {
            ##u"立案":3.0,
            ##u"获刑":3.0,
            ##u"受贿":3.0,
            ##u"有期徒刑":3.0,
            ##u"宣判":3.0,
            ##u"审计":2.0,
            ##u"调查":2.0
        }
        sensitive_terms = self.transform_sensitive_terms(sensitive_words, self.vocabulary)

        try:
            sample_content = samples_unlabeled.db_content.Get(str(sample_id))
            (_, category, date, title, key, url, msgext) = decode_sample_meta(sample_content)
            (version, content, (cat1, cat2, cat3)) = msgext

            print("sample id: %d" % (sample_id))
            print("category: %d" % (category))
            print("key: %s" % (key))
            print("url: %s" % (url))
            print("date: %s" % (date))
            print("title: %s" % (title))
            print("---------------- content ----------------")

            sample_terms, term_map = self.vocabulary.seg_content(content)
            print("sample_terms: %d terms_count: %d" % (sample_terms, len(term_map)))
            terms_list = sorted_dict_by_values(term_map, reverse=True)
            for (term_id, term_used_in_sample) in terms_list:
                term_text = self.vocabulary.get_term_text(term_id)
                print("%s(%d): %d" % (term_text, term_id, term_used_in_sample))
        except KeyError:
            print("Sample %d not found in db_content." % (sample_id))

        db_sm = samples_unlabeled.tsm.open_db_sm()
        try:
            str_sample_info = db_sm.Get(str(sample_id))
            (category, sample_terms, term_map) = msgpack.loads(str_sample_info)
            print("")
            print("---------------- keywords ----------------")
            print("")
            # Score every term of the sample, then list them by descending
            # positive degree.
            terms = {}
            for term_id in term_map:
                term_text = self.vocabulary.get_term_text(term_id)
                term_used = term_map[term_id]
                (pd_word, speciality, popularity) = calculate_term_positive_degree(term_id, tsm_positive, tsm_unlabeled, sensitive_terms)
                terms[term_id] = (pd_word, speciality, popularity, term_used, term_text)
            terms_list = sorted_dict_by_values(terms, reverse = True)
            for (term_id, (pd_word, speciality, popularity, term_used, term_text)) in terms_list:
                print("%s\t%d\t[%.6f,%.6f,%.6f]\t(id:%d)" % (term_text, term_used, pd_word, speciality, popularity, term_id))
        except KeyError:
            print("Sample %d not found in db_sm." % (sample_id))
        samples_unlabeled.tsm.close_db(db_sm)