def main(topicFile='topics_1.txt'):
    """Dump every topic listed in *topicFile* and save each post to its own file.

    The topic file is read from the current directory and holds one topic ID
    per line.  For every topic a directory ``./<topicID>`` is created if it
    does not exist yet and ``Zhihu.dumpTopic`` is called; the returned text
    is then split into records and each record is written to
    ``./<topicID>/<postID>`` unless that file already exists.

    Args:
        topicFile: Name of the topic list file, relative to the current
            directory.
    """
    dumps = []
    with open('./' + topicFile, 'r') as topics:
        for line in topics:
            topic = line.strip()
            if not topic:
                # A blank line would otherwise trigger a dump of topic ''.
                continue
            topic_dir = './' + topic
            if not os.path.exists(topic_dir):
                os.makedirs(topic_dir)
            print('[+] DUMPING ' + topic + '...')
            dumps.append(Zhihu.dumpTopic(topic))
    dump = ''.join(dumps)

    # Record layouts (one field per line), as noted by the original author:
    # KEY (topicID+'\n'+posttype+'\n'+str(postid)+'\n'+question_url+'\n'+''+'\n'+title+'\n'+content+'\n\n')
    # KEY (topicID+'\n'+posttype+'\n'+str(postid)+'\n'+url+'\n'+author+'\n'+question_title+'\n'+ans_content+'\n\n')
    # KEY (topicID+'\n'+posttype+'\n'+str(postid)+'\n'+url+'\n'+author+'\n'+title+'\n'+art_content+'\n\n')
    for item in dump.split('\n\n:\n'):
        fields = item.split('\n')
        if len(fields) <= 6:
            # Not a complete record (fewer than seven fields).
            continue
        # NOTE(review): only the first line of the content (field 6) is kept;
        # any further lines of the record are dropped.  Confirm this is
        # intended.
        topicid, posttype, postid, url, author, title, content = fields[:7]
        path = './' + topicid + '/' + postid
        # Only records whose topic ID is exactly 8 characters long are stored,
        # and posts that are already on disk are never overwritten.
        if len(topicid) != 8 or os.path.exists(path):
            continue
        record = (topicid + '\n' + posttype + '\n' + postid + '\n' + url +
                  '\n' + author + '\n' + title + '\n\n' + content)
        with codecs.open(path, 'a', encoding='utf-8') as post_file:
            post_file.write(record)
def check(topicFile='topics_3.txt'):
    """Re-request every stored post and report those that no longer return 200.

    For each topic ID listed in *topicFile* (one per line, read from the
    current directory) every file in ``./<topicID>/`` is read; line 2 of the
    file is taken as the post type and line 4 as its URL.  Posts of type
    ``'article'`` are requested with ``Zhihu.get_url_zl``, all others with
    ``Zhihu.get_url``.  When the response status is not 200 the post is
    printed, added to the report, and its file is copied to
    ``./Censored/<topicID>/``.

    Args:
        topicFile: Name of the topic list file, relative to the current
            directory.  Defaults to the file the function always used.

    Returns:
        The report text: a ``'POTENTIALLY CENSORED: '`` header followed by
        one ``'<type> : <status> : <url>'`` line per flagged post.
    """
    # Read the topic list up front so the handle is closed before the
    # (slow) network checks start.
    with open('./' + topicFile, 'r') as topics_file:
        topics = [line.strip() for line in topics_file]

    report = ['POTENTIALLY CENSORED: \n\n']
    for topic in topics:
        if not topic:
            # A blank line would make the path './/' and scan the working
            # directory as if it were a topic.
            continue
        print('CHECKING TOPIC: ' + topic)
        path = './' + topic + '/'
        censored_dir = './Censored/' + path.strip().replace('./', '')
        for entry in os.listdir(path):
            # Pause between posts; each one results in a web request.
            time.sleep(2)
            post_path = path + entry.strip()
            with open(post_path, 'r') as post_file:
                spl = post_file.read().split('\n')
            if len(spl) < 4:
                # Type (line 2) and URL (line 4) are required; skip files
                # that do not have them instead of aborting the whole run.
                print('SKIPPING MALFORMED POST FILE: ' + post_path)
                continue
            post_type = spl[1].strip()
            link = spl[3]
            if post_type == 'article':
                res = Zhihu.get_url_zl(link)
            else:
                res = Zhihu.get_url(link)
            if res.status_code == 200:
                continue
            entry_line = (post_type + ' : ' + str(res.status_code) + ' : ' +
                          link + '\n')
            print(entry_line)
            report.append(entry_line)
            if not os.path.exists(censored_dir):
                os.makedirs(censored_dir)
            shutil.copy2(post_path, censored_dir)
    return ''.join(report)
def printCensoredByTopic():
    """Print each topic with its display name and its number of censored posts.

    The per-topic counts come from ``censoredByTopic()``; the display name is
    looked up with ``Zhihu.getTopicName``.  Output is a header line followed
    by one ``'<topicID> (<name>) - <count>'`` line per topic.
    """
    # NOTE(review): total_Posts and the two percentages below are computed
    # but never printed.  They are kept (and corrected) so the calls this
    # function makes are unchanged; either print them or remove them.
    total_Posts = totalPosts()
    total_Censored = totalCensored()
    byTopic = censoredByTopic()
    print('Topic Name - Censored Posts')
    for k, v in byTopic.iteritems():
        number_posts = len(os.listdir('./' + k.strip()))
        # Multiply by a float first: under Python 2, int / int truncates, so
        # the previous float((v / total)) * 100 produced 0.0 for any
        # proportion below 1.  Zero denominators yield 0.0 instead of raising.
        perc_censored = 100.0 * v / total_Censored if total_Censored else 0.0
        perc_topic = 100.0 * v / number_posts if number_posts else 0.0
        print(str(k) + ' (' + Zhihu.getTopicName(k) + ') - ' + str(v))