sf = codecs.open(source_file_path, 'r', 'utf-8') df = codecs.open(dest_file_path, 'w', 'utf-8') cid_set = set() count = 0 for line in sf: line = line.strip() row = line.split('[=]') if row[2] in topic_list and (not row[0] in cid_set): cid_set.add(row[0]) df.write(line + '\n') count += 1 print 'Comments loaded: %d' % len(cid_set) #write_topic_info('TopicInfo', 'tables/ustv/TopicInfo-ustv-title.txt', 'tables/ustv/TopicInfo-raw-part-new.txt') #write_comment_info('CommentInfo', 'tables/ustv/CommentInfo-raw-part.txt', 'tables/ustv/CommentInfo-raw-part-new.txt') def write_comment_info2(topic_dict, source_file_path, dest_file_path): sf = codecs.open(source_file_path, 'r', 'utf-8') df = codecs.open(dest_file_path, 'w', 'utf-8') for line in sf: line = line.strip() row = line.split('[=]') if row[2] in topic_dict: df.write(line + '\n') from prepare import load_topic topic_dict = load_topic('tables/ustv/TopicInfo-raw-part-with-title.txt') write_comment_info2(topic_dict, 'tables/ustv/CommentInfo-ustv-raw-part.txt', 'tables/ustv/CommentInfo-ustv-raw-part-new.txt')
#coding:utf8
"""
Read the ids of the topics that have already been crawled, remove them from
the complete topic list, and write out the list of topics that still have
to be crawled.
"""
import datetime

from prepare import load_topic, load_comment

# Topics already crawled; only membership tests are performed on this object.
current_topic_dict = load_topic('tables/ustv/TopicInfo-ustv.txt')

# The with-blocks close both files even if an error interrupts the loop.
with open('tables/ustv/TopicList-ustv-all.txt', 'r') as f:
    with open('tables/ustv/TopicList-ustv-remain.txt', 'w') as fn:
        for line in f:
            line = line.strip()
            # Keep only the entries that are not among the crawled topics.
            if line not in current_topic_dict:
                fn.write(line + '\n')