#encoding: utf-8
import os
from collections import defaultdict

import jieba
from gensim.models import Word2Vec

# cache_dir(), get(), crawl_report_list(), crawl_plan_list(), text_util and
# ce (content_extract) are project-local helpers defined elsewhere in this repo.


def build_model(cache=True):
    """Train a Word2Vec model over the crawled reports, loading from cache if present."""
    if cache:
        f = "%s/word2vec.model" % cache_dir()
        if os.path.isfile(f):
            return Word2Vec.load(f)
    texts = []
    for url in crawl_report_list():
        html = get(url)
        enc, time, title, text = ce.parse(url, html)
        sentences = text_util.get_sentences(text)
        for s in sentences:
            # Segment each sentence into a list of words with jieba.
            texts.append([w for w in jieba.cut(s)])
    b = Word2Vec(texts)
    if cache:
        b.save(f)
    return b
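# Usage sketch (an assumption, not from the source): with the gensim API of
# this era, the trained model can be queried directly for nearest neighbors.
# The query term u"经济" ("economy") is a hypothetical example.
#
#   model = build_model()
#   for word, score in model.most_similar(u"经济", topn=5):
#       print word.encode('utf-8', 'ignore'), score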
def tf(cache=True, force=False):
    """Count term frequencies over all reports and plans; cache as a tab-separated file."""
    f = "%s/tf.txt" % cache_dir()
    if cache and not force:
        if os.path.isfile(f):
            return True
    d = defaultdict(int)
    for url in (crawl_report_list() + crawl_plan_list()):
        html = get(url)
        enc, time, title, text = ce.parse(url, html)
        sentences = text_util.get_sentences(text)
        for s in sentences:
            for w in jieba.cut(s):
                d[w] += 1
    # Sort words by frequency, highest first.
    r = sorted(d.items(), key=lambda x: x[1], reverse=True)
    if cache:
        out = open(f, "w")
        for k, v in r:
            out.write(("%s\t%s\n" % (k, v)).encode('utf-8', 'ignore'))
        out.close()
    return True
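# Usage sketch (assumption): after tf() has run, the cache file holds one
# "word<TAB>count" line per term, sorted by count descending, so the ten most
# frequent words can be read back like this:
#
#   tf()
#   fin = open("%s/tf.txt" % cache_dir())
#   for line in list(fin)[:10]:
#       word, count = line.rstrip("\n").split("\t")
#       print word, count
#   fin.close()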
#encoding: utf-8
import sys

import content_extract as ce

sys.path.append("../../lib")
import download

if __name__ == "__main__":
    # Download the page at the URL given on the command line and extract
    # its encoding, publication time, title and body text.
    html = download.getPage(sys.argv[1])
    enc, time, title, text = ce.parse(sys.argv[1], html)
    print "标题:" + title.encode('utf-8', 'ignore')  # "Title:"
    print "时间:" + time.encode('utf-8', 'ignore')   # "Time:"
    print '=' * 10
    print "内容:" + text.encode('utf-8', 'ignore')   # "Content:"
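# Example invocation (the script filename and URL are placeholders, not from
# the source); it prints the extracted title, time, and body of the page:
#   python parse_page.py "http://example.com/some-report.html"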