import heapq
import json
import os
import re
import sys

from flask import g, redirect, render_template, url_for

# Project-level helpers (IsDirectory, IsFile, MakeDirectory, CopyFile,
# ExtractTagsFromFile, GetTermFreqFromFile, FindSimilarPassageFromSet,
# WriteTagsToFile, GetComments, GetPassageAbstract, get_magic_number, News,
# DateDir) and the module-level names NEWSDIR and last_mtime (the timestamp
# of the previous run) are defined elsewhere in the project.


def Categorize(source_dirs, output_dir, num_of_tags):
    if not IsDirectory(output_dir):
        print("output_dir does not exist or is not a directory")
        sys.exit(3)
    # set(source_dirs) would disrupt the original order, so keep the list.
    source_sets = source_dirs

    # 1. Collect the paths of all news JSON files under the given directories,
    #    one set per source, e.g. news_sets = [{tencent1, tencent2},
    #    {netease1, netease2}, {sina1, sina2}].
    news_sets = []  # a list of sets
    for source_dir in source_sets:
        if not IsDirectory(source_dir):
            print("Error. source_dir: " + source_dir +
                  " does not exist or is not a directory")
            continue
        tmp_set = set()
        # os.walk yields (current directory, its subdirectories, its files).
        for parent, dir_names, file_names in os.walk(source_dir):
            pattern = re.match(r'.*/([0-9]{8})/?$', parent)
            # A directory modified before the previous run has already been
            # processed, so skip it.
            if pattern is not None and os.stat(parent).st_mtime < last_mtime:
                print('Directory: ' + parent + ' is too old, skipped')
                continue
            for file_name in file_names:
                file_path = parent + '/' + file_name
                if file_name.endswith('.json') and \
                        os.stat(file_path).st_mtime >= last_mtime:
                    tmp_set.add(file_path)
        news_sets.append(tmp_set)

    # 2. For every passage, look for similar passages in the later sources'
    #    sets (each pair of sources is compared once); matched pairs are
    #    copied into output_dir.
    for i in range(0, len(news_sets) - 1):
        for file_path in news_sets[i]:
            # Extract the bare file name, e.g. '025453.json'.
            file_name = re.match(r".*/([-\w]+\.json)", file_path).group(1)
            print("---------------------------------")
            print("Searching for " + file_name + "'s similar passages ...")
            # Helper 1: the top num_of_tags keywords of the passage.
            tags = ExtractTagsFromFile(file_path, num_of_tags)
            # Helper 2: tf-idf weight of each keyword, e.g. {'a': 12, 'b': 8}.
            example_tf = GetTermFreqFromFile(tags, file_path)
            if example_tf is None:
                continue
            for j in range(i + 1, len(news_sets)):
                # Helper 3: path of the most similar passage from another
                # source's set, or None when nothing is similar enough.
                resfile = FindSimilarPassageFromSet(news_sets[j], example_tf)
                if resfile is None:
                    continue
                with open(file_path, 'r', encoding='utf-8') as f:
                    js = json.load(f)
                date = js['date']
                newsId = js['newsId']
                result_path = output_dir + '/' + date + '/' + newsId + '/'
                MakeDirectory(result_path)
                if not os.path.exists(result_path + '/' + file_name):
                    CopyFile(file_path, result_path)
                CopyFile(resfile, result_path)
                print("found similar passage to " + file_name + ": " + resfile)
def FindSimilarPassageFromDirectory(source_dir, example_tf):
    if not IsDirectory(source_dir):
        print("In FindSimilarPassageFromDirectory: " + source_dir +
              " does not exist or is not a directory")
        return None
    heap = []
    tags = list(example_tf.keys())
    for parent, dir_names, file_names in os.walk(source_dir):
        for file_name in file_names:
            if file_name.endswith('.json'):
                file_path = parent + '/' + file_name
                tf = GetTermFreqFromFile(tags, file_path)
                if tf is None:
                    continue
                similarity = CosinSimilarityForDict(example_tf, tf)
                if similarity is not None:
                    # heapq is a min-heap, so store the negated similarity
                    # to pop the most similar passage first.
                    heap.append(SimilarPassage(similarity * -1.0, file_path))
    heapq.heapify(heap)
    if len(heap) == 0:
        return None
    result = heapq.heappop(heap)
    if result.Relevant():
        print("Similarity: " + str(result.similarity))
        return result.file_path
    return None
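# CosinSimilarityForDict and SimilarPassage are used above but defined
# elsewhere in the project. A minimal sketch of what they might look like,
# assuming term-frequency dicts map term -> weight; the 0.2 relevance
# threshold below is an illustrative guess, not the project's actual value.
import math


def CosinSimilarityForDictSketch(tf_a, tf_b):
    # Cosine similarity of two sparse term-weight vectors.
    dot = sum(w * tf_b[t] for t, w in tf_a.items() if t in tf_b)
    norm_a = math.sqrt(sum(w * w for w in tf_a.values()))
    norm_b = math.sqrt(sum(w * w for w in tf_b.values()))
    if norm_a == 0 or norm_b == 0:
        return None
    return dot / (norm_a * norm_b)


class SimilarPassageSketch(object):
    def __init__(self, similarity, file_path):
        # Callers pass the similarity already negated for the min-heap.
        self.similarity = similarity
        self.file_path = file_path

    def __lt__(self, other):
        # Ordering hook used by heapq when comparing heap entries.
        return self.similarity < other.similarity

    def Relevant(self, threshold=0.2):
        # self.similarity is negative, so compare against -threshold.
        return self.similarity <= -threshold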
def view_entry(date, dir_name):
    dir_path = os.path.join(NEWSDIR, date, dir_name)
    if not IsDirectory(dir_path):
        return redirect(url_for('error_page', errcode='No such directory'))
    # Gather the news contents and news comments.
    news = {}
    comments = []
    for file_name in os.listdir(dir_path):
        file_path = os.path.join(dir_path, file_name)
        if IsFile(file_path) and file_name.endswith('.json'):
            with open(file_path, 'r', encoding='utf-8') as f:
                js = json.load(f)
            news[js['source']] = js
            comments.extend(GetComments(js))
    # Sort the comments chronologically.
    comments.sort(key=lambda x: x['time'])
    comment_abstract = GetPassageAbstract(
        '\n'.join([comment['content'] for comment in comments]), 0.5, 0.1, '|')
    return render_template('view_news.html',
                           news=news,
                           comments=comments,
                           comment_abstract=comment_abstract)
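# GetComments is defined elsewhere; a minimal sketch, assuming each news JSON
# stores its comments as a list of {'time': ..., 'content': ...} dicts under
# a 'comments' key (the key name and schema are assumptions):
def GetCommentsSketch(js):
    return js.get('comments', [])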
def get_news(self):
    current_path = os.path.join(self.parent_dir, self.date)
    for news_dir in os.listdir(current_path):
        if IsDirectory(os.path.join(current_path, news_dir)):
            self.news.append(News(current_path, news_dir))
            self.news[-1].init_object()
    self.news.sort(reverse=True)
def ExtractTagsFromDirectory(dir_path, num_of_tags):
    if not IsDirectory(dir_path):
        print("Path does not exist or is not a directory")
        sys.exit(3)
    for parent, dir_names, file_names in os.walk(dir_path):
        for file_name in file_names:
            if file_name.endswith('.json'):
                file_path = parent + '/' + file_name
                tags = ExtractTagsFromFile(file_path, num_of_tags)
                # Write the tags next to the source file: foo.json -> foo.tags.
                WriteTagsToFile(file_path[:-5] + '.tags', tags, 'utf-8')
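# ExtractTagsFromFile itself is defined elsewhere. One plausible sketch,
# assuming the article body sits under a 'content' key and keywords come from
# jieba's tf-idf extractor (both the key name and the use of jieba are
# assumptions, not confirmed by this code):
import jieba.analyse


def ExtractTagsFromFileSketch(file_path, num_of_tags):
    with open(file_path, 'r', encoding='utf-8') as f:
        js = json.load(f)
    # extract_tags returns the topK keywords ranked by tf-idf weight.
    return jieba.analyse.extract_tags(js['content'], topK=num_of_tags)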
def get_entries(timespan=5):
    # Rebuild the cached entry list when it is missing or when the magic
    # number (a cache key derived from the update time) has changed. Note
    # that flask.g only lives for one application context, so this cache is
    # rebuilt on every request unless g is swapped for a longer-lived store.
    if (not hasattr(g, 'entries')) or g.magicnumber != get_magic_number():
        g.magicnumber = get_magic_number()
        print('Update time: ' + g.magicnumber)
        g.entries = []
        for dir_name in os.listdir(NEWSDIR):
            if IsDirectory(os.path.join(NEWSDIR, dir_name)):
                g.entries.append(DateDir(NEWSDIR, dir_name))
                g.entries[-1].get_news()
        g.entries.sort(reverse=True)
    return g.entries
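# A hypothetical driver showing how Categorize might be invoked; the source
# directory names (taken from the tencent/netease/sina example in the comments
# above) and the tag count are illustrative, not the project's actual
# configuration:
if __name__ == '__main__':
    Categorize(['./tencent', './netease', './sina'], './result', 10)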