def Categorize(source_dirs, output_dir, num_of_tags):
    """Cross-match news articles from several source directories.

    For every recent ``*json`` article in each source directory, search the
    remaining (later) directories for a similar passage and copy both files
    into ``output_dir/<date>/<newsId>/``.

    Args:
        source_dirs: ordered iterable of source directories (order preserved).
        output_dir: destination root; must be an existing directory.
        num_of_tags: number of keyword tags to extract per article.

    Exits the process with status 3 when output_dir is not a directory.
    """
    if not IsDirectory(output_dir):
        print("output_dir not exists or not a directory")
        sys.exit(3)

    # Keep the caller's order; converting to a set would disrupt it.
    source_sets = source_dirs

    news_sets = []  # a list of sets, one set of file paths per source dir
    for source_dir in source_sets:
        if not IsDirectory(source_dir):
            print("Error. source_dir: " + source_dir +
                  " not exists or not a directory")
            continue
        tmp_set = set()
        for parent, dir_names, file_names in os.walk(source_dir):
            # Skip YYYYMMDD-named directories not modified since last_mtime —
            # presumably processed on a previous run (TODO confirm semantics).
            pattern = re.match(r'.*/([0-9]{8})[/]{,1}$', parent)
            if pattern is not None and os.stat(parent).st_mtime < last_mtime:
                print('Directory: ' + parent + ' is too old, skipped')
                continue
            for file_name in file_names:
                if file_name[-4:] == 'json' and os.stat(
                        parent + '/' + file_name).st_mtime >= last_mtime:
                    tmp_set.add(parent + '/' + file_name)
        news_sets.append(tmp_set)

    for i in range(0, len(news_sets) - 1):
        for file_path in news_sets[i]:
            file_name = re.match(r".*/([-\w]+\.json)", file_path).group(1)
            print("---------------------------------")
            print("Searching for " + file_name + "'s similar passages ...")

            tags = ExtractTagsFromFile(file_path, num_of_tags)
            example_tf = GetTermFreqFromFile(tags, file_path)  # a dict
            if example_tf is None:
                continue
            # Only compare against the *later* sources to avoid duplicate pairs.
            for j in range(i + 1, len(news_sets)):
                resfile = FindSimilarPassageFromSet(news_sets[j], example_tf)
                if resfile is None:
                    continue
                # Read date/newsId from the article's own JSON metadata.
                with codecs.open(file_path, 'r', 'utf-8') as f:
                    js = json.load(f)
                date = js['date']
                newsId = js['newsId']

                result_path = output_dir + '/' + date + '/' + newsId + '/'
                MakeDirectory(result_path)

                if not os.path.exists(result_path + '/' + file_name):
                    CopyFile(file_path, result_path)
                CopyFile(resfile, result_path)
                print("found similar passage to " + file_name + ": " + resfile)
# Exemple #2
def FindSimilarPassageFromDirectory(source_dir, example_tf):
    """Return the path of the passage under source_dir most similar to example_tf.

    Walks source_dir for ``*json`` files, scores each by cosine similarity of
    its term frequencies against example_tf, and returns the best match's
    path. Returns None when source_dir is invalid, no candidate scores, or
    the best candidate is not relevant enough.

    Args:
        source_dir: directory tree to search.
        example_tf: dict mapping tag/term -> frequency of the reference text.
    """
    if not IsDirectory(source_dir):
        print("In findsimilarpassagefromdirectory: " + source_dir +
              " not exists or not a directory")
        return None

    tags = list(example_tf.keys())
    heap = []
    for parent, dir_names, file_names in os.walk(source_dir):
        for file_name in file_names:
            if file_name[-4:] == 'json':
                file_path = parent + '/' + file_name
                tf = GetTermFreqFromFile(tags, file_path)
                if tf is None:
                    continue
                similarity = CosinSimilarityForDict(example_tf, tf)
                if similarity is not None:
                    # Negate so the min-heap surfaces the highest similarity.
                    heap.append(SimilarPassage(similarity * -1.0, file_path))

    if not heap:
        return None
    heapq.heapify(heap)
    result = heapq.heappop(heap)
    if result.Relevant():
        print("Similarity: " + str(result.similarity))
        return result.file_path
    return None
def view_entry(date, dir_name):
    """Render one news entry: every source's article plus merged, time-sorted comments."""
    dir_path = os.path.join(NEWSDIR, date, dir_name)
    if not IsDirectory(dir_path):
        return redirect(url_for('error_page', errcode='No this directory'))

    # Collect article JSON per source and gather all their comments.
    news = {}
    comments = []
    for entry in os.listdir(dir_path):
        entry_path = os.path.join(dir_path, entry)
        if not (IsFile(entry_path) and entry.endswith('json')):
            continue
        with codecs.open(entry_path, 'r', 'utf-8') as fp:
            js = json.load(fp)
        news[js['source']] = js
        comments.extend(GetComments(js))

    # Merge comments chronologically, then summarize them.
    comments.sort(key=lambda c: c['time'])
    joined = '\n'.join(c['content'] for c in comments)
    comment_abstract = GetPassageAbstract(joined, 0.5, 0.1, '|')
    return render_template('view_news.html',
                           news=news,
                           comments=comments,
                           comment_abstract=comment_abstract)
    def get_news(self):
        """Populate self.news with an initialized News object per subdirectory, newest first."""
        date_path = os.path.join(self.parent_dir, self.date)
        for entry in os.listdir(date_path):
            if not IsDirectory(os.path.join(date_path, entry)):
                continue
            item = News(date_path, entry)
            item.init_object()
            self.news.append(item)
        self.news.sort(reverse=True)
# Exemple #5
def ExtractTagsFromDirectory(dir_path, num_of_tags):
    """Write a ``.tags`` file next to every ``*json`` article under dir_path.

    Each article's top num_of_tags keywords are saved UTF-8-encoded to
    ``<article>.tags``. Exits with status 3 when dir_path is not a directory.
    """
    if not IsDirectory(dir_path):
        print("Path not exists or not a directory")
        sys.exit(3)
    for parent, _dir_names, names in os.walk(dir_path):
        for name in (n for n in names if n.endswith('json')):
            src = parent + '/' + name
            tags = ExtractTagsFromFile(src, num_of_tags)
            # Strip the '.json' suffix and replace it with '.tags'.
            WriteTagsToFile(src[:-5] + '.tags', tags, 'utf-8')
# Exemple #6
def get_entries(timespan=5):
    """Return the cached DateDir list on ``g``, rebuilding when the magic number changed."""
    # hasattr is checked first so a cold cache never touches g.magicnumber.
    cache_stale = (not hasattr(g, 'entries')) or g.magicnumber != get_magic_number()
    if cache_stale:
        g.magicnumber = get_magic_number()
        print('Update time: ' + g.magicnumber)
        g.entries = []
        for name in os.listdir(NEWSDIR):
            if IsDirectory(os.path.join(NEWSDIR, name)):
                date_dir = DateDir(NEWSDIR, name)
                date_dir.get_news()
                g.entries.append(date_dir)
        g.entries.sort(reverse=True)
    return g.entries
# Exemple #7
def Categorize(source_dirs, output_dir, num_of_tags):
    """Cross-match news articles from several source directories.

    For every recent ``*json`` article in each source directory, search the
    remaining (later) directories for a similar passage and copy both files
    into ``output_dir/<date>/<newsId>/``.

    Args:
        source_dirs: ordered iterable of source directories (order preserved).
        output_dir: destination root; must be an existing directory.
        num_of_tags: number of keyword tags to extract per article.

    Exits the process with status 3 when output_dir is not a directory.
    """
    if not IsDirectory(output_dir):
        print("output_dir not exists or not a directory")
        sys.exit(3)

    # Keep the caller's order; converting to a set would disrupt it.
    source_sets = source_dirs

    # 1. Collect paths of all recent news JSON files, one set per source:
    #    news_sets = [{tencent1, tencent2}, {netease1, netease2}, {sina1, ...}]
    news_sets = []
    for source_dir in source_sets:
        if not IsDirectory(source_dir):
            print("Error. source_dir: " + source_dir +
                  " not exists or not a directory")
            continue
        tmp_set = set()
        # os.walk yields (current dir, its subdirs, its files) triples.
        for parent, dir_names, file_names in os.walk(source_dir):
            # YYYYMMDD directories not modified since last_mtime were already
            # processed on a previous run, so skip them.
            pattern = re.match(r'.*/([0-9]{8})[/]{,1}$', parent)
            if pattern is not None and os.stat(parent).st_mtime < last_mtime:
                print('Directory: ' + parent + ' is too old, skipped')
                continue
            for file_name in file_names:
                if file_name[-4:] == 'json' and os.stat(
                        parent + '/' + file_name).st_mtime >= last_mtime:
                    tmp_set.add(parent + '/' + file_name)
        news_sets.append(tmp_set)

    # 2. For every article, find similar passages in later sources and copy
    #    matched pairs into output_dir.
    for i in range(0, len(news_sets) - 1):
        for file_path in news_sets[i]:
            # Extract the bare file name, e.g. '025453.json'.
            file_name = re.match(r".*/([-\w]+\.json)", file_path).group(1)
            print("---------------------------------")
            print("Searching for " + file_name + "'s similar passages ...")

            # Top keywords for this article: tags = ['...', '...'].
            tags = ExtractTagsFromFile(file_path, num_of_tags)
            # tf-idf term weights: example_tf = {'term': 12, 'other': 8}.
            example_tf = GetTermFreqFromFile(tags, file_path)
            if example_tf is None:
                continue
            for j in range(i + 1, len(news_sets)):
                # Best-matching article path from the other source, or None.
                resfile = FindSimilarPassageFromSet(news_sets[j], example_tf)
                if resfile is None:
                    continue
                # Articles are UTF-8 JSON; be explicit so the platform default
                # encoding cannot break the load.
                with open(file_path, 'r', encoding='utf-8') as f:
                    js = json.load(f)
                date = js['date']
                newsId = js['newsId']

                result_path = output_dir + '/' + date + '/' + newsId + '/'
                MakeDirectory(result_path)

                if not os.path.exists(result_path + '/' + file_name):
                    CopyFile(file_path, result_path)
                CopyFile(resfile, result_path)
                print("found similar passage to " + file_name + ": " +
                      resfile)