Ejemplo n.º 1
0
    def load_collect(self, page):
        """
        Load ``page`` collect pages concurrently and dump them to disk.

        The first line of ``<data_dir>cookie_collect`` is handed to
        ``changeCookie``; when that file is missing a notice is printed and
        nothing is fetched. Page indices ``0 .. page - 1`` are passed to
        ``self.load_collect_once`` in batches of at most 10 threads, each
        batch being joined before the next one starts. Afterwards the values
        of ``self.collect`` are concatenated in ascending key order and
        written, newline-separated, to ``<data_dir>collect_wyy``.

        :param page: number of page indices to fetch
        """
        version = begin_time()
        cookie_path = '%scookie_collect' % data_dir
        if not os.path.exists(cookie_path):
            print('TB cookie not exist!!!')
            return
        with codecs.open(cookie_path, 'r', encoding='utf-8') as f:
            # Strip only the line terminator. Blindly slicing off the last
            # character would truncate a cookie stored without a trailing
            # newline and would leave a stray '\r' on CRLF files.
            cookie = f.readline().rstrip('\r\n')
        changeCookie(cookie)
        changeHtmlTimeout(30)

        batch_size = 10  # upper bound on simultaneously running threads
        for begin in range(0, page, batch_size):
            end = min(page, begin + batch_size)
            threadings = [
                threading.Thread(target=self.load_collect_once,
                                 args=(index, ))
                for index in range(begin, end)
            ]
            for work in threadings:
                work.start()
            for work in threadings:
                work.join()

        # NOTE(review): self.collect is presumably filled by
        # load_collect_once with one list of strings per key - confirm.
        # Flatten in a single pass; sum(lists, []) re-copies the accumulated
        # list on every addition and is quadratic in the number of entries.
        collect = [
            line for key in sorted(self.collect) for line in self.collect[key]
        ]
        with codecs.open('%scollect_wyy' % data_dir, 'w',
                         encoding='utf-8') as f:
            f.write("\n".join(collect))
        end_time(version)
Ejemplo n.º 2
0
    def load_goods(self):
        """
        Load every entry of ``self.request_list`` concurrently and dump the
        results to disk.

        The first line of ``<data_dir>cookie`` is handed to ``changeCookie``;
        when that file is missing a notice is printed and nothing is fetched.
        One thread per ``(index, tid)`` pair of ``self.request_list`` runs
        ``self.load_goods_once``; all threads are started and then joined.
        Afterwards the values of ``self.goods`` are concatenated in ascending
        key order and written, newline-separated, to ``<data_dir>goods``.
        """
        version = begin_time()
        cookie_path = '%scookie' % data_dir
        if not os.path.exists(cookie_path):
            print('Youdao Note cookie not exist!!!')
            return
        with codecs.open(cookie_path, 'r', encoding='utf-8') as f:
            # Strip only the line terminator. Blindly slicing off the last
            # character would truncate a cookie stored without a trailing
            # newline and would leave a stray '\r' on CRLF files.
            cookie = f.readline().rstrip('\r\n')
        changeCookie(cookie)

        threadings = [
            threading.Thread(target=self.load_goods_once, args=(index, tid))
            for index, tid in enumerate(self.request_list)
        ]
        for work in threadings:
            work.start()
        for work in threadings:
            work.join()

        # NOTE(review): self.goods is presumably filled by load_goods_once
        # with one list of strings per key - confirm.
        # Flatten in a single pass; sum(lists, []) re-copies the accumulated
        # list on every addition and is quadratic in the number of entries.
        goods = [line for key in sorted(self.goods) for line in self.goods[key]]
        with codecs.open('%sgoods' % data_dir, 'w', encoding='utf-8') as f:
            f.write("\n".join(goods))
        end_time(version)
Ejemplo n.º 3
0
    def getZhihuView(self):
        """
        Refresh ``self.zhihu_id`` and ``self.zhihu_views`` from the Zhihu
        creator content-statistics endpoint.

        The first line of ``<data_dir>cookie`` (or an empty string when the
        file does not exist) is handed to ``changeCookie``. Page 1 of the
        article statistics is requested through ``self.get_request``; if the
        response is empty or carries no ``'data'`` key the method returns
        (printing the response when it contains a ``'code'`` key). Otherwise
        every article on page 1 and on each following page is matched to a
        slug, first by title through ``self.title2slug`` and then by id
        through ``self.zhihu_id_map``, and its id and read count are stored
        under that slug. Titles of unmatched articles are printed.
        """
        cookie_path = '%scookie' % data_dir
        if os.path.exists(cookie_path):
            with codecs.open(cookie_path, 'r', encoding='utf-8') as f:
                # Strip only the line terminator so a cookie stored without a
                # trailing newline is not truncated.
                cookie = f.readline().rstrip('\r\n')
        else:
            cookie = ''
        changeCookie(cookie)

        url = "".join([
            'https://www.zhihu.com/api/v4/creator/content_statistics/',
            'articles?order_field=object_created&order_sort=descend&begin_date=2018-09-01&end_date=',
            datetime.datetime.now().strftime("%Y-%m-%d"),
            '&page_no='
        ])

        def record(articles):
            # Store id and read count of each article under its slug.
            for article in articles:
                title = article['title']
                article_id = int(article['url_token'])
                read_count = int(article['read_count'])

                if title in self.title2slug:
                    slug = self.title2slug[title]
                elif article_id in self.zhihu_id_map:
                    slug = self.zhihu_id_map[article_id]
                else:
                    print(title)
                    continue
                self.zhihu_id[slug] = article_id
                self.zhihu_views[slug] = read_count

        first_page = self.get_request(url + '1', 1)
        if not first_page:
            return
        if 'data' not in first_page:
            if 'code' in first_page:
                print(first_page)
            return
        record(first_page['data'])

        # NOTE(review): assumes the API returns 10 articles per page, as the
        # original `count // 10` did - confirm.
        page_size = 10
        # Ceiling division: a plain floor plus one extra page requested an
        # empty page whenever count was an exact multiple of the page size.
        total_pages = -(-first_page['count'] // page_size)
        for offset in range(total_pages - 1):
            print('zhihu', offset)
            page = self.get_request(url + str(offset + 2), 1)
            # Apply the same guard as for page 1; an error payload without
            # 'data' previously raised KeyError here.
            if not page or 'data' not in page:
                continue
            record(page['data'])
Ejemplo n.º 4
0
    def getZhihuView(self):
        """
        Refresh ``self.zhihu_id`` and ``self.zhihu_views`` from the Zhihu
        creator content-statistics endpoint.

        The content of ``<data_dir>cookie`` (as returned by ``read_file`` and
        joined into one string) is handed to ``changeCookie``. Page 1 of the
        article statistics is requested through ``self.get_request``; if the
        response is empty or carries no ``'data'`` key the method returns
        (echoing a warning when the response contains a ``'code'`` key).
        Otherwise every article on page 1 and on each following page is
        matched to a slug, first by title through ``self.title2slug`` and
        then by id through ``self.zhihu_id_map``, and its id and read count
        are stored under that slug. Titles of unmatched articles are echoed
        at debug level.
        """
        cookie = ''.join(read_file('{}cookie'.format(data_dir)))
        changeCookie(cookie)
        url = ''.join([
            self.ZHIHU_URL,
            'articles?order_field=object_created&order_sort=descend&begin_date=2018-09-01&end_date=',
            datetime.datetime.now().strftime("%Y-%m-%d"), '&page_no='
        ])

        def record(articles):
            # Store id and read count of each article under its slug.
            for article in articles:
                title = article['title']
                article_id = int(article['url_token'])
                read_count = int(article['read_count'])

                if title in self.title2slug:
                    slug = self.title2slug[title]
                elif article_id in self.zhihu_id_map:
                    slug = self.zhihu_id_map[article_id]
                else:
                    echo('0|debug', title)
                    continue
                self.zhihu_id[slug] = article_id
                self.zhihu_views[slug] = read_count

        first_page = self.get_request('{}{}'.format(url, 1), 1,
                                      lambda i: not i)
        if not first_page:
            return
        if 'data' not in first_page:
            if 'code' in first_page:
                echo('0|warning', first_page)
            return
        echo(3, 'zhihu', first_page)
        record(first_page['data'])

        # NOTE(review): assumes the API returns 10 articles per page, as the
        # original `count // 10` did - confirm.
        page_size = 10
        # Ceiling division: with floor division the final, partially filled
        # page (e.g. page 3 when count == 25) was never requested.
        total_pages = -(-first_page['count'] // page_size)
        for index in range(1, total_pages):
            echo(1, 'zhihu', index)
            page = self.get_request('{}{}'.format(url, 1 + index), 1,
                                    lambda i: not i)
            echo(2, 'zhihu', page)
            if not page:
                continue
            # Apply the same guard as for page 1; an error payload without
            # 'data' previously raised KeyError here.
            if 'data' not in page:
                if 'code' in page:
                    echo('0|warning', page)
                continue
            record(page['data'])