def add_news(self, newsList, ix=None):
    """Add a batch of news documents to the whoosh index and commit.

    Args:
        newsList: iterable of dicts, each matching the index schema.
        ix: an already-open whoosh index; opened from self.idxDir when None.
    """
    if ix is None:
        ix = open_dir(self.idxDir, schema=self.schema, indexname=self.idxName)
    # the writer context manager commits on exit
    with ix.writer() as idx_writer:
        for doc in show_status(newsList, "Add documents to %s" % self.idxName):
            idx_writer.add_document(**doc)
        logger.info("Committing ...")
def deep_compress(self, imgType=None, input_dir=None, output_dir=None, quality=84, overwrite=False):
    """Recompress JPEG covers with the external ``guetzli`` encoder.

    Args:
        imgType: "sm" or "bg" to use the preset dir/quality pairs, or None
            to use the explicitly supplied input_dir/output_dir/quality.
        overwrite: when False, files already present in output_dir are skipped.

    Raises:
        ValueError: on an unrecognized imgType.
    """
    import shlex  # local import: only needed for command quoting

    if imgType == "sm":
        # Do not use this to compress small images, they easily lose sharpness!
        input_dir, output_dir, quality = Pre_Sm_Cover_Dir, Sm_Cover_Dir, 95
    elif imgType == "bg":
        input_dir, output_dir, quality = Pre_Bg_Cover_Dir, Bg_Cover_Dir, 85
    elif imgType is None:
        pass
    else:
        raise ValueError("unexpected image type '%s' !" % imgType)

    for file in show_status(sorted(os.listdir(input_dir))):
        inImg = os.path.join(input_dir, file)
        outImg = os.path.join(output_dir, file)
        if os.path.exists(outImg) and not overwrite:
            continue
        # SECURITY/ROBUSTNESS FIX: quote both paths so whitespace or shell
        # metacharacters in file names cannot break (or inject into) the command.
        code = os.system('guetzli --quality %d --nomemlimit %s %s'
                         % (quality, shlex.quote(inImg), shlex.quote(outImg)))
        # os.system returns the wait status; its low bits carry the terminating
        # signal, so this matches a child killed by Ctrl-C (POSIX behavior).
        if code == signal.SIGINT:
            break
def get_occur_freq(self):
    """Count how many documents each word occurs in, then prune singletons.

    Increments details[word]["occur_freq"] once per document containing the
    word, and removes every word that occurred in only one document.
    """
    for news_id, tokens in show_status(self.wordFrags.items(), "get occurrence frequency"):
        # set() so each word counts at most once per document
        for token in set(tokens):
            if token in self.details:
                entry = self.details[token]
                entry["occur_freq"] = entry.get("occur_freq", 0) + 1
    # drop words that appear in a single document only
    singletons = [w for w, entry in self.details.items() if entry["occur_freq"] == 1]
    for w in singletons:
        self.details.pop(w)
def get_fragments(self, fromCache=False):
    """Cut every kept news content into word fragments and cache them to disk.

    NOTE(review): when fromCache is True this returns None without loading
    anything -- presumably the caller reads the cache file itself; confirm.
    """
    if fromCache:
        return
    with SQLiteDB() as newsDB:
        rows = newsDB.select("newsContent", ("newsID", "content")).fetchall()
        # skip news explicitly marked for discarding
        kept = [row for row in rows if row["newsID"] not in self.discard_newsIDs]
        fragments = {
            row["newsID"]: self.lcut(row["content"])
            for row in show_status(kept, "cut news")
        }
        pkl_dump(self.Fragments_File, fragments)
def get_detail_sum(self):
    """Collect, for each high-frequency word, its per-document frequency list.

    Returns:
        dict mapping word -> list of counts, one entry per news document
        (0 when the word does not appear in that document).
    """
    detail_freq = {word: [] for word in self.highFreqWords}
    for news_id, tokens in show_status(self.wordFrags.items(), "get detail frequency"):
        counts = dict.fromkeys(self.highFreqWords, 0)
        for token in tokens:
            if token in self.highFreqWords:
                counts[token] += 1
        for word, cnt in counts.items():
            detail_freq[word].append(cnt)
    return detail_freq
def get_bins(self):
    """Extract keywords per news item and build binary keyword-presence vectors.

    Dumps three pickles: per-news keywords, the unique keyword list, and a
    dict mapping newsID -> boolean numpy vector over the unique keywords.
    """
    keyWords = {newsID: self.extract(words) for newsID, words in self.fragments.items()}
    pkl_dump(self.Key_Words_File, keyWords)
    uniqueKeyWords = list(set(iter_flat(keyWords.values())))
    pkl_dump(self.Key_Words_List_File, uniqueKeyWords)
    bins = {}
    for newsID, words in show_status(keyWords.items(), "get bins"):
        # PERF FIX: membership was tested against the raw word sequence for
        # every unique keyword (O(len(words)) per test); build a set once.
        word_set = set(words)
        bins[newsID] = np.array([(word in word_set) for word in uniqueKeyWords])
    pkl_dump(self.Bins_File, bins)
def get_total_freq(self):
    """Accumulate each word's corpus-wide frequency, then prune junk tokens.

    After counting, removes words that are ASCII punctuation or not Chinese
    (as judged by the project's isChinese helper).
    """
    for news_id, tokens in show_status(self.wordFrags.items(), "get total frequency"):
        for token in tokens:
            entry = self.details.setdefault(token, dict())
            entry["total_freq"] = entry.get("total_freq", 0) + 1
    # iterate over a snapshot of the keys so we can pop while traversing
    for word in list(self.details):
        if word in string.punctuation or not isChinese(word):
            self.details.pop(word)
def get_binarization(self):
    """Build per-news 0/1 vectors over the word list from word_frequency.csv.

    Dumps the ordered word list and a dict newsID -> numpy int vector where
    position i is 1 iff wordsList[i] occurs in that news item.
    """
    wordsList = [row["word"] for row in read_csv("word_frequency.csv")]
    wordsSet = frozenset(wordsList)  # O(1) membership during the scan
    binarize = dict()
    for news_id, tokens in show_status(self.wordFrags.items(), "get binarization"):
        flags = {word: 0 for word in wordsList}
        for token in tokens:
            if token in wordsSet:
                flags[token] = 1
        binarize[news_id] = flags
    # fix the column order to wordsList so all vectors are comparable
    binarize = {nid: np.array([flags[word] for word in wordsList])
                for nid, flags in binarize.items()}
    pkl_dump("wordsList.pkl", wordsList)
    pkl_dump("binarize.pkl", binarize)
def batchget_newsInfo(self, begin=0):
    """Page through the news-info API in steps of 7, starting at *begin*.

    Keeps fetching pages until an empty page is returned, accumulating all
    results into one list.

    BUG FIX: the inner offset generator previously rebound ``begin`` to 0,
    so the caller-supplied starting offset was silently ignored.
    """
    def offset_generator(start):
        # yields start, start+7, start+14, ... indefinitely (page size is 7)
        offset = start
        while True:
            yield offset
            offset += 7

    totalNewsInfo = []
    for offset in show_status(offset_generator(begin), "Getting newsInfo ..."):
        newsInfo = self.get_newsInfo(offset)
        if newsInfo == []:  # empty page -> no more data
            break
        totalNewsInfo.extend(newsInfo)
    return totalNewsInfo
def download_covers(self):
    """Download each news cover over HTTP into Origin_Cover_Dir.

    The saved file name is "<newsID>.<ext>", where the extension comes from
    the response's Content-Type header (e.g. "image/jpeg" -> "jpeg").
    """
    with SQLiteDB() as newsDB:
        rows = newsDB.select("newsInfo", ("newsID", "cover")).fetchall()
        if self.new is not None:
            # restrict to newly-seen news items
            rows = [row for row in rows if row["newsID"] in self.new]
        for row in show_status(rows):
            news_id, url = row['newsID'], row['cover']
            resp = requests.get(url)
            ext = resp.headers.get('Content-Type').split('/')[1]
            filename = "%s.%s" % (news_id, ext)
            with open(os.path.join(Origin_Cover_Dir, filename), 'wb') as fp:
                fp.write(resp.content)
def _cut_words(self, fromCache=True):
    """Tokenize every news text with jieba, filtering stop words, whitespace
    and pure digits.

    Args:
        fromCache: when True, load the previously pickled result instead of
            re-cutting.

    Returns:
        dict mapping newsID -> list of word fragments.
    """
    if fromCache:
        wordFrags = pkl_load("wordFrags.pkl")
    else:
        wordFragsList = list()
        with DataBase() as db:
            newsID, newsData = db.get_news()
        jieba.enable_parallel(4)
        for news in show_status(newsData, "cut words"):
            frags = jieba.cut(news, cut_all=False)
            words = [frag for frag in frags
                     if (frag not in self.stopWords)
                     and (not frag.isspace())
                     and (not frag.isdigit())]
            wordFragsList.append(words)
        jieba.disable_parallel()
        wordFrags = dict(zip(newsID, wordFragsList))
        # BUG FIX: pkl_dump was previously called with only the file name,
        # omitting the payload (cf. pkl_dump(file, data) at every other call
        # site), so the cache was never actually written.
        pkl_dump("wordFrags.pkl", wordFrags)
    return wordFrags
def detail_analyse(self):
    """Run the full frequency analysis and attach std / cv to each word's details.

    Pipeline: total frequency -> occurrence frequency -> keep the top 16384
    high-frequency words -> per-document frequency lists -> dispersion stats.
    """
    self.get_total_freq()
    self.get_occur_freq()
    self.highFreqWords = self.get_highFreqWords(top=1024 * 16)
    # restrict the detail table to the high-frequency vocabulary
    self.details = {word: info for word, info in self.details.items()
                    if word in self.highFreqWords}
    self.detailFreq = self.get_detail_sum()
    for word, freqs in show_status(self.detailFreq.items(), "analyse detail"):
        arr = np.array(freqs)
        mean = np.mean(arr)
        std = np.std(arr)
        # cv (coefficient of variation) = std / mean: relative dispersion of
        # the word's per-document frequency
        self.details[word].update({
            "word": word,
            "std": std,
            "cv": std / mean,
        })
def plt_show():
    """Plot all pairwise Tanimoto similarities of the news binarization
    vectors, sorted in descending order."""
    binarize = pkl_load("binarize.pkl")
    scores = list()
    for id_x, vec_x in show_status(binarize.items()):
        for id_y, vec_y in binarize.items():
            if id_x > id_y:
                # the similarity matrix is symmetric -- skip half of it
                continue
            dot = np.dot(vec_x, vec_y)
            # Tanimoto coefficient for 0/1 vectors: |A∩B| / |A∪B|
            tc = dot / (np.sum(vec_x) + np.sum(vec_y) - dot)
            scores.append(tc)
    scores.sort(reverse=True)
    plt.plot(np.array(scores))
    plt.show()
def to_jpeg(self, input_dir=Origin_Cover_Dir, output_dir=Origin_JPEG_Cover_Dir, overwrite=False):
    """Convert every cover image in input_dir to an RGB JPEG in output_dir.

    Non-JPEG sources (png/gif) get a ".jpeg" extension; JPEG sources keep
    their original file name. Existing outputs are skipped unless
    *overwrite* is set.

    FIXES: the two branches duplicated identical exists/convert/save logic
    (only the output name differed), and the PIL images were never closed,
    leaking file handles -- now deduplicated and managed with ``with``.
    """
    for name in show_status(os.listdir(input_dir)):
        with Image.open(os.path.join(input_dir, name)) as img:
            if img.format.lower() != "jpeg":  # png/gif
                out_name = os.path.splitext(name)[0] + '.jpeg'
            else:
                out_name = name
            outputPath = os.path.join(output_dir, out_name)
            if os.path.exists(outputPath) and not overwrite:
                continue
            # normalize the mode (e.g. RGBA/P) so the JPEG save cannot fail
            img.convert('RGB').save(outputPath)
def cv_compress_sm(self, input_dir=Origin_JPEG_Cover_Dir, output_dir=Sm_Cover_Dir, min_size=Sm_Cover_Default_Min_Size, overwrite=False):
    """Resize covers so their SHORTER side equals *min_size*, keeping aspect ratio.

    Existing outputs are skipped unless *overwrite* is set.

    PERF FIX: os.listdir(output_dir) was previously re-scanned on every loop
    iteration (O(n^2) directory reads); it is now read once into a set.
    """
    existing = set(os.listdir(output_dir))
    for name in show_status(os.listdir(input_dir)):
        if name in existing and not overwrite:
            continue
        originImg = cv2.imread(os.path.join(input_dir, name))
        origin_h, origin_w = originImg.shape[:2]
        if origin_h >= origin_w:
            # portrait (or square): width is the short side
            new_w = int(min_size)
            new_h = int(new_w / origin_w * origin_h)
        else:
            # landscape: height is the short side
            new_h = int(min_size)
            new_w = int(new_h / origin_h * origin_w)
        newImg = cv2.resize(originImg, (new_w, new_h), interpolation=CV_Default_Interpolation)
        cv2.imwrite(os.path.join(output_dir, name), newImg)
def cv_compress_bg(self, input_dir=Origin_JPEG_Cover_Dir, output_dir=Pre_Bg_Cover_Dir, width=Bg_Cover_Default_Width, overwrite=False):
    """Downscale covers to a fixed *width* for background use.

    Falls back to copying the original when resizing would upscale, or when
    the resized file ends up no smaller than the source.

    PERF FIX: os.listdir(output_dir) was previously re-scanned on every loop
    iteration (O(n^2) directory reads); it is now read once into a set.
    """
    existing = set(os.listdir(output_dir))
    for name in show_status(os.listdir(input_dir)):
        if name in existing and not overwrite:
            continue
        inImg = os.path.join(input_dir, name)
        originImg = cv2.imread(inImg)
        origin_h, origin_w = originImg.shape[:2]
        new_w = int(width)
        new_h = int(new_w / origin_w * origin_h)
        if new_w >= origin_w:
            # target width would upscale -> keep the original
            self.copy_origin(name, input_dir, output_dir)
        else:
            newImg = cv2.resize(originImg, (new_w, new_h), interpolation=CV_Default_Interpolation)
            outImg = os.path.join(output_dir, name)
            cv2.imwrite(outImg, newImg)
            if os.path.getsize(outImg) >= os.path.getsize(inImg):
                # the resize didn't shrink the file -> replace with the original
                self.copy_origin(name, input_dir, output_dir)
def batchget_newsContent(self, newsInfos):
    """Enrich each news-info dict in place with its full content.

    Fetches the content for every entry, merges it into the dict, and drops
    the now-consumed "content_url" key. Returns the same (mutated) list.
    """
    for info in show_status(newsInfos, "Getting newsContent ..."):
        info.update(self.get_newsContent(info))
        info.pop("content_url")
    return newsInfos
def filter_results(self):
    """Drop words whose coefficient of variation is too low (<= 12),
    i.e. words whose per-document frequency barely varies."""
    low_cv_words = [
        word
        for word, info in show_status(list(self.details.items()), "filter results")
        if info["cv"] <= 12
    ]
    for word in low_cv_words:
        self.details.pop(word)