Example #1
0
def main(topicFile='topics_1.txt'):
    """Dump every topic listed in ``topicFile`` and store each post on disk.

    ``topicFile`` is read from the current directory, one topic ID per line.
    For each topic a directory ``./<topic>`` is created if missing and
    ``Zhihu.dumpTopic(topic)`` is called; the returned text of all topics is
    concatenated and split into records on the ``'\\n\\n:\\n'`` separator.

    A record is written to ``./<topicid>/<postid>`` only when it has at least
    seven lines, its first line (the topic ID) is exactly 8 characters long,
    and the target file does not exist yet.

    Blank lines in the topics file are skipped.
    """
    chunks = []
    with open('./' + topicFile, 'r') as topics:
        for line in topics:
            topic = line.strip()
            if not topic:
                # A blank line would otherwise be dumped as topic ''.
                continue
            topic_dir = './' + topic
            if not os.path.exists(topic_dir):
                os.makedirs(topic_dir)
            print('[+] DUMPING ' + topic + '...')
            chunks.append(Zhihu.dumpTopic(topic))
    dump = ''.join(chunks)

    # Record layouts, one field per line (as noted by the original author):
    #   question: topicID, posttype, postid, question_url, '',     title,          content
    #   answer:   topicID, posttype, postid, url,          author, question_title, ans_content
    #   article:  topicID, posttype, postid, url,          author, title,          art_content
    for item in dump.split('\n\n:\n'):
        fields = item.split('\n')
        if len(fields) <= 6:
            continue
        # NOTE(review): only the first content line (fields[6]) is kept; any
        # further lines of the record are discarded -- confirm this is intended.
        topicid, posttype, postid, url, author, title, content = fields[:7]

        path = './' + topicid + '/' + postid
        if len(topicid) != 8 or os.path.exists(path):
            continue

        header = '\n'.join([topicid, posttype, postid, url, author, title])
        with codecs.open(path, 'a', encoding='utf-8') as textfile:
            textfile.write(header + '\n\n' + content)
Example #2
0
def check():
    """Re-fetch every saved post and report those that no longer return 200.

    Topic IDs are read from ``./topics_3.txt`` (one per line). Every file in
    ``./<topic>/`` is read; its second line is taken as the post type and its
    fourth line as the post URL. Posts of type ``'article'`` are fetched with
    ``Zhihu.get_url_zl``, all others with ``Zhihu.get_url``.

    When the response status code is not 200, a ``type : status : link`` line
    is printed and appended to the report, and the file is copied to
    ``./Censored/<topic>/`` (created on demand).

    Blank topic lines and files with fewer than four lines are skipped.

    Returns:
        The report string, starting with ``'POTENTIALLY CENSORED: \\n\\n'``.
    """
    report = 'POTENTIALLY CENSORED: \n\n'
    with open('./topics_3.txt', 'r') as topics:
        for line in topics:
            topic = line.strip()
            if not topic:
                # A blank line would make os.listdir scan the working directory.
                continue
            print('CHECKING TOPIC: ' + topic)
            path = './' + topic + '/'
            censored_dir = './Censored/' + path.replace('./', '')

            for name in os.listdir(path):
                post_path = path + name.strip()
                with open(post_path, 'r') as post:
                    fields = post.read().split('\n')
                if len(fields) < 4:
                    # Not a saved post (type and URL lines are missing).
                    print('[!] SKIPPING MALFORMED FILE: ' + post_path)
                    continue
                post_type = fields[1].strip()
                link = fields[3]

                # Presumably throttles requests to the remote site.
                time.sleep(2)
                if post_type == 'article':
                    res = Zhihu.get_url_zl(link)
                else:
                    res = Zhihu.get_url(link)
                if res.status_code == 200:
                    continue

                message = (post_type + ' : ' + str(res.status_code) +
                           ' : ' + link + '\n')
                print(message)
                report += message
                if not os.path.exists(censored_dir):
                    os.makedirs(censored_dir)
                shutil.copy2(post_path, censored_dir)

    return report
Example #3
0
def printCensoredByTopic():
    """Print the censored-post count for every topic.

    Writes a header line followed by one line per entry of
    ``censoredByTopic()`` in the form ``<topic id> (<topic name>) - <count>``,
    where the name comes from ``Zhihu.getTopicName``.
    """
    # NOTE: per-topic percentages used to be computed here but were never
    # printed. They relied on integer division (always 0 on Python 2) and
    # could raise ZeroDivisionError or OSError, aborting the report, so the
    # dead computation was removed. Printed output is unchanged.
    by_topic = censoredByTopic()
    print('Topic Name - Censored Posts')
    for topic_id, count in by_topic.items():
        topic_name = Zhihu.getTopicName(topic_id)
        print(str(topic_id) + ' (' + topic_name + ') - ' + str(count))