Esempio n. 1
0
	def add_news(self, newsList, ix=None):
		"""Add every item of ``newsList`` to the index as one document.

		Args:
			newsList: iterable of mappings; each mapping is expanded into the
				keyword arguments of ``writer.add_document``.
			ix: index object to write to. When ``None``, the index is opened
				with ``open_dir`` using ``self.idxDir``, ``self.schema`` and
				``self.idxName``.
		"""
		index = ix
		if index is None:
			index = open_dir(self.idxDir, schema=self.schema, indexname=self.idxName)
		with index.writer() as writer:
			progress = show_status(newsList, "Add documents to %s" % self.idxName)
			for fields in progress:
				writer.add_document(**fields)
			# Logged while the writer context is still open, right before it exits.
			logger.info("Committing ...")
Esempio n. 2
0
    def deep_compress(self,
                      imgType=None,
                      input_dir=None,
                      output_dir=None,
                      quality=84,
                      overwrite=False):
        """Compress every file of a directory with the ``guetzli`` command.

        Args:
            imgType: ``"sm"`` or ``"bg"`` selects a preset that overrides
                ``input_dir``, ``output_dir`` and ``quality``; ``None`` keeps
                the values passed by the caller.
            input_dir: directory whose files are compressed.
            output_dir: directory receiving the results under the same
                file names.
            quality: value passed to ``guetzli --quality``.
            overwrite: when false, files that already exist in
                ``output_dir`` are skipped.

        Raises:
            ValueError: if ``imgType`` is not ``"sm"``, ``"bg"`` or ``None``.
            OSError: if the ``guetzli`` executable cannot be started.
        """
        # Function-scope import so the block does not depend on a file-level one.
        import subprocess

        if imgType == "sm":
            # Do not use this to compress small images: they easily end up unclear!
            input_dir, output_dir, quality = Pre_Sm_Cover_Dir, Sm_Cover_Dir, 95
        elif imgType == "bg":
            input_dir, output_dir, quality = Pre_Bg_Cover_Dir, Bg_Cover_Dir, 85
        elif imgType is not None:
            raise ValueError("unexpected image type '%s' !" % imgType)

        for file in show_status(sorted(os.listdir(input_dir))):
            inImg = os.path.join(input_dir, file)
            outImg = os.path.join(output_dir, file)
            if os.path.exists(outImg) and not overwrite:
                continue
            # An argument list (no shell) keeps paths containing spaces or
            # shell metacharacters intact and uninterpreted.
            command = ['guetzli', '--quality', '%d' % quality,
                       '--nomemlimit', inImg, outImg]
            try:
                result = subprocess.run(command)
            except KeyboardInterrupt:
                # Ctrl-C reaches this process as well: stop the batch quietly.
                break
            # On POSIX a negative return code -N means "terminated by signal N".
            if result.returncode == -signal.SIGINT:
                break
Esempio n. 3
0
	def get_occur_freq(self):
		"""Count in how many news items each known word occurs, then drop singletons.

		For every entry of ``self.wordFrags``, each distinct word that is
		already a key of ``self.details`` gets its ``"occur_freq"`` counter
		incremented once. Afterwards every word whose ``"occur_freq"`` equals
		1 is removed from ``self.details``.
		"""
		for _newsID, fragList in show_status(self.wordFrags.items(),"get occurrence frequency"):
			# set(): a word counts at most once per news item.
			for token in set(fragList):
				if token not in self.details:
					continue
				entry = self.details[token]
				entry["occur_freq"] = entry.get("occur_freq",0) + 1
		# Iterate over a snapshot because entries are removed while scanning.
		for token, entry in list(self.details.items()):
			if entry["occur_freq"] == 1: # drop words that occur only once
				self.details.pop(token)
Esempio n. 4
0
	def get_fragments(self, fromCache=False):
		"""Cut the content of every stored news item and dump the result.

		Args:
			fromCache: when true the method returns immediately without
				doing anything.

		Rows are read from the ``newsContent`` table, rows whose ``newsID``
		is in ``self.discard_newsIDs`` are dropped, each remaining content is
		passed through ``self.lcut`` and the ``{newsID: fragments}`` mapping
		is written with ``pkl_dump`` to ``self.Fragments_File``.
		"""
		if fromCache:
			return
		with SQLiteDB() as newsDB:
			rows = newsDB.select("newsContent",("newsID","content")).fetchall()
			kept = [row for row in rows if row["newsID"] not in self.discard_newsIDs]
		fragments = dict()
		for row in show_status(kept, "cut news"):
			key = row["newsID"]
			fragments[key] = self.lcut(row["content"])
		pkl_dump(self.Fragments_File, fragments)
Esempio n. 5
0
	def get_detail_sum(self):
		"""Collect, for each high-frequency word, its count in every news item.

		Returns:
			dict mapping each element of ``self.highFreqWords`` to a list
			holding one count per entry of ``self.wordFrags`` (0 when the
			word does not appear in that entry).
		"""
		detailFreq = dict()
		for key in self.highFreqWords:
			detailFreq[key] = []
		for _newsID, fragList in show_status(self.wordFrags.items(),"get detail frequency"):
			# Fresh per-news counter whose keys are exactly the high-frequency words.
			counts = dict.fromkeys(self.highFreqWords, 0)
			for token in fragList:
				if token in counts:
					counts[token] += 1
			for token, count in counts.items():
				detailFreq[token].append(count)
		return detailFreq
Esempio n. 6
0
	def get_bins(self):
		"""Extract key words per news item and dump them with their membership vectors.

		Three objects are written with ``pkl_dump``:
		``self.Key_Words_File`` (``{newsID: self.extract(words)}``),
		``self.Key_Words_List_File`` (the de-duplicated list of all key words)
		and ``self.Bins_File`` (``{newsID: boolean array}`` with one entry per
		word of that list, true when the word is among the item's key words).
		"""
		keyWords = dict()
		for newsID, words in self.fragments.items():
			keyWords[newsID] = self.extract(words)
		pkl_dump(self.Key_Words_File, keyWords)

		vocabulary = list(set(iter_flat(keyWords.values())))
		pkl_dump(self.Key_Words_List_File, vocabulary)

		bins = dict()
		for newsID, words in show_status(keyWords.items(), "get bins"):
			membership = [(term in words) for term in vocabulary]
			bins[newsID] = np.array(membership)

		pkl_dump(self.Bins_File, bins)
Esempio n. 7
0
	def get_total_freq(self):
		"""Count the total number of appearances of every word, then filter words.

		Each word found in ``self.wordFrags`` gets an entry in
		``self.details`` whose ``"total_freq"`` is incremented per appearance.
		Afterwards a word is removed when it is contained in
		``string.punctuation`` (a substring test) or when ``isChinese(word)``
		is false.
		"""
		for _newsID, fragList in show_status(self.wordFrags.items(),"get total frequency"):
			for token in fragList:
				entry = self.details.setdefault(token, dict())
				entry["total_freq"] = entry.get("total_freq",0) + 1
		# Snapshot of the keys: entries are removed during the scan.
		for token in list(self.details):
			# Short-circuit: isChinese is only consulted for non-punctuation words.
			if token in string.punctuation or not isChinese(token):
				self.details.pop(token)
Esempio n. 8
0
	def get_binarization(self):
		"""Build a 0/1 vector per news item over the words of ``word_frequency.csv``.

		The ``"word"`` column of ``word_frequency.csv`` defines the word list.
		For every entry of ``self.wordFrags`` an array is built with one
		element per listed word: 1 when the word appears in the entry, else 0.
		The word list is dumped to ``wordsList.pkl`` and the
		``{newsID: array}`` mapping to ``binarize.pkl``.
		"""
		wordsList = [row["word"] for row in read_csv("word_frequency.csv")]
		wordsSet = frozenset(wordsList)
		binarize = dict()
		for newsID, fragList in show_status(self.wordFrags.items(),"get binarization"):
			# Listed words that actually occur in this news item.
			present = wordsSet.intersection(fragList)
			flags = [1 if word in present else 0 for word in wordsList]
			binarize[newsID] = np.array(flags)
		pkl_dump("wordsList.pkl",wordsList)
		pkl_dump("binarize.pkl",binarize)
Esempio n. 9
0
	def batchget_newsInfo(self, begin=0):
		"""Fetch news info page by page until an empty page is returned.

		Args:
			begin: offset passed to the first ``self.get_newsInfo`` call;
				subsequent calls advance it in steps of 7. (Previously the
				argument was shadowed and the scan always started at 0.)

		Returns:
			list with the items of every fetched page, in fetch order.
		"""
		from itertools import count

		totalNewsInfo = []
		for offset in show_status(count(begin, 7),"Getting newsInfo ..."):
			newsInfo = self.get_newsInfo(offset)
			# An empty (or missing) page marks the end of the data.
			if not newsInfo:
				break
			totalNewsInfo.extend(newsInfo)
		return totalNewsInfo
Esempio n. 10
0
 def download_covers(self):
     """Download the cover image of every selected news item.

     The ``newsID`` and ``cover`` columns are read from the ``newsInfo``
     table; when ``self.new`` is not ``None`` only rows whose ``newsID``
     is in it are kept. Each cover URL is fetched with ``requests.get``
     and saved in ``Origin_Cover_Dir`` as ``<newsID>.<ext>``, where
     ``<ext>`` is the subtype of the response's ``Content-Type`` header.

     Raises:
         ValueError: if a response has no usable ``Content-Type`` header.
     """
     with SQLiteDB() as newsDB:
         newsInfo = newsDB.select("newsInfo",
                                  ("newsID", "cover")).fetchall()
         if self.new is not None:
             newsInfo = [
                 news for news in newsInfo if news["newsID"] in self.new
             ]
     for news in show_status(newsInfo):
         newsID, url = news['newsID'], news['cover']
         resp = requests.get(url)
         content_type = resp.headers.get('Content-Type') or ''
         # Drop parameters such as "; charset=binary" before taking the
         # subtype, so they never leak into the file name.
         media_type = content_type.split(';', 1)[0].strip()
         ext = media_type.partition('/')[2]
         if not ext:
             raise ValueError(
                 "cannot derive a file extension for %r: Content-Type is %r"
                 % (url, content_type))
         file = "%s.%s" % (newsID, ext)
         with open(os.path.join(Origin_Cover_Dir, file), 'wb') as fp:
             fp.write(resp.content)
Esempio n. 11
0
	def _cut_words(self, fromCache=True):
		"""Return the ``{newsID: [word, ...]}`` mapping of segmented news.

		Args:
			fromCache: when true, load the mapping from ``wordFrags.pkl``;
				otherwise read the news via ``DataBase().get_news()``, cut
				them with ``jieba`` and write the result to ``wordFrags.pkl``.

		Returns:
			dict mapping each news ID to its list of fragments, excluding
			stop words (``self.stopWords``), whitespace-only and digit-only
			fragments.
		"""
		if fromCache:
			return pkl_load("wordFrags.pkl")

		with DataBase() as db:
			newsID, newsData = db.get_news()
		wordFragsList = list()
		jieba.enable_parallel(4)
		try:
			for news in show_status(newsData,"cut words"):
				frags = jieba.cut(news, cut_all=False)
				words = [
					frag for frag in frags
					if frag not in self.stopWords
					and not frag.isspace()
					and not frag.isdigit()
				]
				wordFragsList.append(words)
		finally:
			# Always leave parallel mode, even when cutting fails midway.
			jieba.disable_parallel()
		wordFrags = dict(zip(newsID, wordFragsList))
		# The object to store was missing from this call, so the cache was never written.
		pkl_dump("wordFrags.pkl", wordFrags)
		return wordFrags
Esempio n. 12
0
	def detail_analyse(self):
		"""Run the frequency pipeline and attach dispersion statistics to each word.

		Steps: ``get_total_freq``, ``get_occur_freq``, selection of
		``self.highFreqWords`` via ``get_highFreqWords(top=1024*16)``,
		restriction of ``self.details`` to those words, and computation of
		``self.detailFreq`` via ``get_detail_sum``. Every entry of
		``self.details`` then receives the keys ``"word"``, ``"std"``
		(standard deviation of its per-news counts) and ``"cv"``
		(``std / mean`` of those counts).
		"""
		self.get_total_freq()
		self.get_occur_freq()
		self.highFreqWords = self.get_highFreqWords(top=1024*16)
		kept = dict()
		for key, value in self.details.items():
			if key in self.highFreqWords:
				kept[key] = value
		self.details = kept
		self.detailFreq = self.get_detail_sum()

		for word, freq in show_status(self.detailFreq.items(),"analyse detail"):
			samples = np.array(freq)
			mean = np.mean(samples)
			std = np.std(samples)

			stats = {"word": word, "std": std, "cv": std/mean}
			self.details[word].update(stats)
Esempio n. 13
0
def plt_show():
	"""Plot the sorted pairwise similarity coefficients of the binarized news.

	Loads ``binarize.pkl`` (a ``{newsID: vector}`` mapping) and, for every
	pair with ``newsID_x <= newsID_y``, computes
	``dot / (sum(x) + sum(y) - dot)``. The coefficients are sorted in
	descending order and shown with matplotlib.
	"""
	binarize = pkl_load("binarize.pkl")

	coefficients = []
	for id_a, vec_a in show_status(binarize.items()):
		for id_b, vec_b in binarize.items():
			if id_a > id_b: # evaluate each unordered pair once, halving the work
				continue
			shared = np.dot(vec_a, vec_b)
			union = np.sum(vec_a) + np.sum(vec_b) - shared
			coefficients.append(shared / union)

	coefficients.sort(reverse=True)

	plt.plot(np.array(coefficients))
	plt.show()
Esempio n. 14
0
 def to_jpeg(self,
             input_dir=Origin_Cover_Dir,
             output_dir=Origin_JPEG_Cover_Dir,
             overwrite=False):
     """Save an RGB copy of every image of ``input_dir`` into ``output_dir``.

     Images whose detected format is not JPEG are written under the same
     base name with a ``.jpeg`` extension; JPEG images keep their file
     name. Existing output files are left untouched unless ``overwrite``
     is true.
     """
     for imgPath in show_status(os.listdir(input_dir)):
         img = Image.open(os.path.join(input_dir, imgPath))
         if img.format.lower() == "jpeg":
             outputName = imgPath
         else:  # png/gif
             outputName = os.path.splitext(imgPath)[0] + '.jpeg'
         outputPath = os.path.join(output_dir, outputName)
         if overwrite or not os.path.exists(outputPath):
             img.convert('RGB').save(outputPath)
Esempio n. 15
0
 def cv_compress_sm(self,
                    input_dir=Origin_JPEG_Cover_Dir,
                    output_dir=Sm_Cover_Dir,
                    min_size=Sm_Cover_Default_Min_Size,
                    overwrite=False):
     """Resize every image of ``input_dir`` so its shorter side is ``min_size``.

     Args:
         input_dir: directory containing the source images.
         output_dir: directory receiving the resized images under the
             same file names.
         min_size: length in pixels given to the shorter side; the other
             side is scaled proportionally.
         overwrite: when false, files already listed in ``output_dir``
             are skipped.
     """
     # List the output directory once instead of once per input file.
     # Files written by this loop never need re-checking because every
     # input name is visited a single time.
     existing = set(os.listdir(output_dir))
     for imgPath in show_status(os.listdir(input_dir)):
         if imgPath in existing and not overwrite:
             continue
         originImg = cv2.imread(os.path.join(input_dir, imgPath))
         origin_h, origin_w = originImg.shape[:2]
         if origin_h >= origin_w:
             # Portrait or square: the width is the shorter side.
             new_w = int(min_size)
             new_h = int(new_w / origin_w * origin_h)
         else:
             # Landscape: the height is the shorter side.
             new_h = int(min_size)
             new_w = int(new_h / origin_h * origin_w)
         newImg = cv2.resize(originImg, (new_w, new_h),
                             interpolation=CV_Default_Interpolation)
         cv2.imwrite(os.path.join(output_dir, imgPath), newImg)
Esempio n. 16
0
 def cv_compress_bg(self,
                    input_dir=Origin_JPEG_Cover_Dir,
                    output_dir=Pre_Bg_Cover_Dir,
                    width=Bg_Cover_Default_Width,
                    overwrite=False):
     """Shrink every image of ``input_dir`` to ``width`` pixels wide.

     Args:
         input_dir: directory containing the source images.
         output_dir: directory receiving the results under the same
             file names.
         width: target width in pixels; the height is scaled
             proportionally.
         overwrite: when false, files already listed in ``output_dir``
             are skipped.

     Images that are not wider than ``width`` are handed to
     ``self.copy_origin`` instead of being resized, and so are images
     whose resized file is not smaller than the source file.
     """
     # List the output directory once instead of once per input file.
     # Files written by this loop never need re-checking because every
     # input name is visited a single time.
     existing = set(os.listdir(output_dir))
     for imgPath in show_status(os.listdir(input_dir)):
         if imgPath in existing and not overwrite:
             continue
         inImg = os.path.join(input_dir, imgPath)
         originImg = cv2.imread(inImg)
         origin_h, origin_w = originImg.shape[:2]
         new_w = int(width)
         if new_w >= origin_w:
             # Never upscale: keep the original file.
             self.copy_origin(imgPath, input_dir, output_dir)
             continue
         new_h = int(new_w / origin_w * origin_h)
         newImg = cv2.resize(originImg, (new_w, new_h),
                             interpolation=CV_Default_Interpolation)
         outImg = os.path.join(output_dir, imgPath)
         cv2.imwrite(outImg, newImg)
         # Re-encoding can yield a larger file; fall back to the original.
         if os.path.getsize(outImg) >= os.path.getsize(inImg):
             self.copy_origin(imgPath, input_dir, output_dir)
Esempio n. 17
0
	def batchget_newsContent(self, newsInfos):
		"""Merge the fetched content into every news item, in place.

		Args:
			newsInfos: iterable of mutable mappings; each one is updated
				with the result of ``self.get_newsContent`` and loses its
				``"content_url"`` key.

		Returns:
			the same ``newsInfos`` object that was passed in.
		"""
		for item in show_status(newsInfos,"Getting newsContent ..."):
			content = self.get_newsContent(item)
			item.update(content)
			item.pop("content_url")
		return newsInfos
Esempio n. 18
0
	def filter_results(self):
		"""Remove from ``self.details`` every word whose ``"cv"`` is at most 12."""
		# Work on a snapshot: entries are deleted while scanning.
		snapshot = list(self.details.items())
		for word, entry in show_status(snapshot,"filter results"):
			if entry["cv"] <= 12:
				del self.details[word]