Example #1
0
    sf = codecs.open(source_file_path, 'r', 'utf-8')
    df = codecs.open(dest_file_path, 'w', 'utf-8')
    
    cid_set = set()
    count = 0
    for line in sf:
        line = line.strip()
        row = line.split('[=]')
        if row[2] in topic_list and (not row[0] in cid_set):
            cid_set.add(row[0])
            df.write(line + '\n')
            count += 1
    print 'Comments loaded: %d' % len(cid_set)

#write_topic_info('TopicInfo', 'tables/ustv/TopicInfo-ustv-title.txt', 'tables/ustv/TopicInfo-raw-part-new.txt')
#write_comment_info('CommentInfo', 'tables/ustv/CommentInfo-raw-part.txt', 'tables/ustv/CommentInfo-raw-part-new.txt')

def write_comment_info2(topic_dict, source_file_path, dest_file_path):
    """Copy the rows whose third field is in ``topic_dict`` to a new file.

    Each line of the source file is stripped of surrounding whitespace and
    split on the ``[=]`` separator.  A line is kept when its third field
    (index 2, presumably the topic id -- confirm against the table layout)
    is a member of ``topic_dict``.  Kept lines are written to the
    destination file, each followed by a single newline.

    Args:
        topic_dict: Any container supporting ``in``; the third field of
            every row is tested against it.
        source_file_path: Path of the input file, read as UTF-8.
        dest_file_path: Path of the output file, written as UTF-8.  An
            existing file is overwritten.
    """
    # The with-blocks guarantee that both files are flushed and closed,
    # even when decoding or writing fails part-way through.
    with codecs.open(source_file_path, 'r', 'utf-8') as sf:
        with codecs.open(dest_file_path, 'w', 'utf-8') as df:
            for line in sf:
                line = line.strip()
                row = line.split('[=]')
                # Blank or truncated lines have no third field and so can
                # never match; skip them instead of raising IndexError.
                if len(row) > 2 and row[2] in topic_dict:
                    df.write(line + '\n')
            
# Driver: load the topic table, then filter the raw comment dump against it.
from prepare import load_topic

topic_dict = load_topic(
    'tables/ustv/TopicInfo-raw-part-with-title.txt')

# Arguments: topic container, source comment file, destination file.
write_comment_info2(
    topic_dict,
    'tables/ustv/CommentInfo-ustv-raw-part.txt',
    'tables/ustv/CommentInfo-ustv-raw-part-new.txt')
#coding:utf8

"""
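This script reads in the topic ids that have already been crawled, removes
them from the full topic list, and generates a new topic list containing
only the topics that have not been crawled yet.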
"""

import datetime

from prepare import load_topic, load_comment

# Topics loaded from the current TopicInfo table; entries found here are
# left out of the "remain" list written below.
current_topic_dict = load_topic('tables/ustv/TopicInfo-ustv.txt')

# Copy every line of the full topic list that is not in current_topic_dict
# into the "remain" list.  The with-blocks close both files even if reading
# or writing raises, which the manual close() calls did not guarantee.
with open('tables/ustv/TopicList-ustv-all.txt', 'r') as f:
    with open('tables/ustv/TopicList-ustv-remain.txt', 'w') as fn:
        for line in f:
            line = line.strip()
            if line not in current_topic_dict:
                fn.write(line + '\n')