def cal_q_d_log(self, q, dc=None, d=None, clean=False, content_field='content'):
    '''
    :param q: query; may be a string sentence or a word list, both are handled here
    :param dc: term-count dict for the document; computed from d when omitted
    :param d: dict-typed document; d[content_field] holds the content, which has
              already gone through basic preprocessing
    :param clean: whether to run clean_tools over the query and document
    :param content_field: key under which the document content is stored
    :return: log P(q|d) under Dirichlet smoothing
    '''
    if type(q) != list:
        q = basic_preprocess(q)
    if type(d[content_field]) != list:
        d[content_field] = basic_preprocess(d[content_field])
    if clean:
        q = clean_tools(q)
    if dc is None and d is not None:
        if clean:
            d[content_field] = clean_tools(d[content_field])
        dc = defaultdict(int)
        for w in d[content_field]:
            dc[w] += 1.0
    # print '[DEBUG] dc:', dc['US'], dc
    ret = 0.0  # sum of logs; starting at 1.0 would add a spurious constant
    for w in q:
        ret += math.log((dc[w] + self.lmd * self.C[w]) /
                        float(len(d[content_field]) + self.lmd))
        # Jelinek-Mercer alternative: self.lmd * self.C[w] + (1 - self.lmd) * dc[w]
    return ret
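# For reference, the Dirichlet-smoothed estimate the loop above computes is
#   P(w|d) = (c(w, d) + mu * P(w|C)) / (|d| + mu).
# A minimal standalone sketch of the same log score, assuming `mu` plays the
# role of self.lmd and `p_bg` of self.C (the collection model); both names are
# illustrative, not part of this class.
def _dirichlet_log_score_sketch(query_words, doc_words, p_bg, mu=2000.0):
    dc = Counter(doc_words)
    doc_len = float(len(doc_words))
    score = 0.0
    for w in query_words:
        # Tiny floor keeps math.log defined for terms unseen in both models.
        p = max((dc[w] + mu * p_bg.get(w, 0.0)) / (doc_len + mu), 1e-12)
        score += math.log(p)
    return score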
def test_2():
    q = 'US Military Crisis Response'
    q = cut_words(q)
    d1 = sample_doc  # 'US Military Crisis Response US Military Crisis Response US Military Crisis Response'
    d1 = basic_preprocess(d1)
    d2 = {
        'score': 9.644032,
        'key': 'ebola-1bbd62fe484a96be675ab80a304f0320a742a2da67f696cde413aee99e9f9349',
        'content': d1,
    }
    lm = LMDirichlet()
    dc = lm.cal_dc(d1)
    lm.load(LMDirichlet_Json)
    print 'lm.C[US]:', lm.C['US']
    p1 = lm.cal_w_d('US', d1)
    p2 = lm.cal_q_d(q, d=d2)
    p3 = lm.cal_q_d_log(q, d=d2)
    p4 = lm.cal_q_d(q, d=d2, dc=dc)
    print "p w,d:", p1, math.log(p1)
    print "p q,d:", p2
    print "log p q,d:", p3
    print "p q,d by dc:", p4
def test_3():
    from data_utils import basic_preprocess
    q = 'US Military Crisis Response'
    q = cut_words(q)
    d1 = sample_doc  # 'US Military Crisis Response US Military Crisis Response US Military Crisis Response'
    q_list = basic_preprocess(q)
    d1 = basic_preprocess(d1)
    d2 = {
        'score': 9.644032,
        'key': 'ebola-1bbd62fe484a96be675ab80a304f0320a742a2da67f696cde413aee99e9f9349',
        'content': d1,
    }
    lm = LMDirichlet()
    dc = lm.cal_dc(d1)
    # Score with the stemmed model first, then LMDirichlet_without_stem below.
    lm.load(LMDirichlet_Json)
    # print 'lm.C[US]:', lm.C['US']
    # print d1
    p1 = lm.cal_w_d('US', d1)
    p2 = lm.cal_q_d(q, d=d2)
    p3 = lm.cal_q_d_log(q, d=d2)
    p4 = lm.cal_q_d(q, d=d2, dc=dc)
    print "p w,d:", p1, math.log(p1)
    print "p q,d:", p2
    print "log p q,d:", p3
    print "p q,d by dc:", p4
    print "===================="
    print "q list:", q_list
    lm.load(LMDirichlet_without_stem)
    # print 'lm.C[US]:', lm.C['US']
    p1 = lm.cal_w_d(q_list[0], d1)
    p2 = lm.cal_q_d(q, d=d2)
    p3 = lm.cal_q_d_log(q, d=d2)
    p4 = lm.cal_q_d(q, d=d2, dc=dc)
    print "p w,d:", p1, math.log(p1)
    print "p q,d:", p2
    print "log p q,d:", p3
    print "p q,d by dc:", p4
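# Hedged sanity check tying the two scoring paths together: with the
# zero-initialized accumulator in cal_q_d_log above, the log score should
# match log(cal_q_d) up to floating point. Not part of the original tests.
def test_log_consistency():
    q = cut_words('US Military Crisis Response')
    d2 = {'content': basic_preprocess(sample_doc)}
    lm = LMDirichlet()
    lm.load(LMDirichlet_Json)
    p = lm.cal_q_d(q, d=d2)
    lp = lm.cal_q_d_log(q, d=d2)
    assert abs(lp - math.log(p)) < 1e-9, (lp, math.log(p))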
def cal_q_d(self, q, dc=None, d=None, clean=False, content_field='content'):
    '''
    :param q: should be a word list; a string is preprocessed as a fallback
    :param dc: term-count dict for the document; computed from d when omitted
    :param d: dict-typed document; d[content_field] holds the content, which has
              already gone through basic preprocessing
    :param clean: whether to run clean_tools over the query and document
    :param content_field: key under which the document content is stored
    :return: P(q|d) under Dirichlet smoothing
    '''
    # print "QUERY:", q
    if type(q) != list:
        print "before process q:", q
        q = basic_preprocess(q)
        print "after process q:", q
    if type(d[content_field]) != list:
        d[content_field] = basic_preprocess(d[content_field])
    if clean:
        q = clean_tools(q)
    if dc is None and d is not None:
        if clean:
            d[content_field] = clean_tools(d[content_field])
        dc = defaultdict(int)
        for w in d[content_field]:
            dc[w] += 1.0
    # print 'CHECKING..., doc len:', len(d[content_field])
    # for w in q:
    #     print "check w cnt:", w, dc[w]
    # print '[DEBUG] dc:', dc['US'], dc
    ret = 1.0
    for w in q:
        ret *= (dc[w] + self.lmd * self.C[w]) / float(len(d[content_field]) + self.lmd)
        # Jelinek-Mercer alternative: self.lmd * self.C[w] + (1 - self.lmd) * dc[w]
    return ret
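# Why the log variant exists: the per-term factors above are small, so the raw
# product in cal_q_d underflows for longer queries. Illustrative numbers only.
def _underflow_demo():
    p_term = 1e-5                     # a plausible smoothed per-term probability
    print "10 terms:", p_term ** 10   # ~1e-50, still representable in float64
    print "70 terms:", p_term ** 70   # below float64 range, collapses to 0.0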
def deal_nytimes(file_id, in_dir, out_dir, json_dir, stem_dir, stem_jsdir,
                 overwrite=False):
    def get_content(blocks):
        # Treat the longest block as the article body; keep every block that is
        # a prefix of it, then append the body itself.
        try:
            max_len = max([len(b) for b in blocks])
            main = [b for b in blocks if len(b) == max_len][0].strip()
            blocks = [b for b in blocks if main.startswith(b.strip())]
            return blocks + [main]
        except Exception as e:
            logging.exception("[!][%s] Exception: %s", file_id, e)
            return []

    def write_line_data(file_name, key, words):
        with codecs.open(file_name, "w", "utf-8") as fl:
            fl.write("{} ".format(key))
            fl.write(",".join(words))
            fl.write("\n")

    in_file = in_dir.format(file_id)
    out_file = out_dir.format(file_id)
    json_file = json_dir.format(file_id)
    stem_file = stem_dir.format(file_id)
    stem_json = stem_jsdir.format(file_id)
    if exists(out_file) and exists(stem_file) and not overwrite:
        return False
    logging.info("[#] dealing file: %s", in_file)
    js = json.load(codecs.open(in_file, "r", "utf-8"))
    text = " ".join([js["title"]] + get_content(js["content"].values()))
    words = du.basic_preprocess(text)
    if not exists(out_file) or overwrite:
        write_line_data(out_file, js["doc_id"], words)
    stems = du.stemmer_by_porter(words)
    if not exists(stem_file) or overwrite:
        write_line_data(stem_file, js["doc_id"], stems)
    doc_dict = {"words": Counter(words), "id": file_id, "key": js["doc_id"]}
    if not exists(json_file) or overwrite:
        json.dump(doc_dict, codecs.open(json_file, "w", "utf-8"))
    if not exists(stem_json) or overwrite:
        doc_dict["words"] = Counter(stems)
        json.dump(doc_dict, codecs.open(stem_json, "w", "utf-8"))
    return True
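# Hypothetical driver for deal_nytimes; the path templates and the id range
# are assumptions for illustration, not taken from the actual pipeline config.
def _run_nytimes_sketch():
    in_dir = "data/nytimes/raw/{}.json"
    out_dir = "data/nytimes/words/{}.txt"
    json_dir = "data/nytimes/json/{}.json"
    stem_dir = "data/nytimes/stems/{}.txt"
    stem_jsdir = "data/nytimes/stems_json/{}.json"
    for file_id in range(1000):
        try:
            deal_nytimes(file_id, in_dir, out_dir, json_dir, stem_dir, stem_jsdir)
        except Exception:
            logging.exception("[!] failed on file_id %s", file_id)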
def deal_ebola(file_id, in_dir, out_dir, json_dir, stem_dir, stem_jsdir,
               overwrite=True):
    def write_line_data(file_name, key, words):
        with codecs.open(file_name, "w", "utf-8") as fl:
            fl.write("{} ".format(key))
            fl.write(",".join(words))
            fl.write("\n")

    in_file = in_dir.format(file_id)
    out_file = out_dir.format(file_id)
    json_file = json_dir.format(file_id)
    stem_file = stem_dir.format(file_id)
    stem_json = stem_jsdir.format(file_id)
    logging.info("[#] deal file_id: %s", file_id)
    if (exists(out_file) and not overwrite and exists(stem_file)
            and exists(json_file) and exists(stem_json)):
        return False
    logging.info("[#] dealing file_id: %s", file_id)
    js = json.load(codecs.open(in_file, "r", "utf-8"))
    text = parse_html(js["content"])
    words = du.basic_preprocess(text, length_limit=1)
    doc_dict = {"words": Counter(words), "id": file_id, "key": js["key"]}
    stem_words = du.stemmer_by_porter(words)
    if not exists(json_file) or overwrite:
        json.dump(doc_dict, codecs.open(json_file, "w", "utf-8"))
    if not exists(out_file) or overwrite:
        write_line_data(out_file, js["key"], words)
    if not exists(stem_file) or overwrite:
        write_line_data(stem_file, js["key"], stem_words)
    if not exists(stem_json) or overwrite:
        doc_dict["words"] = Counter(stem_words)
        json.dump(doc_dict, codecs.open(stem_json, "w", "utf-8"))
    return True
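# Note on the JSON outputs above: json.dump writes the Counter as a plain
# object, so readers must rebuild it on load. The file name is illustrative.
def _load_doc_dict_sketch(json_file="data/ebola/json/sample.json"):
    doc = json.load(codecs.open(json_file, "r", "utf-8"))
    doc["words"] = Counter(doc["words"])  # restore Counter semantics
    return doc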