Beispiel #1
0
 def __init__(self):
     '''
     Run one complete RSS crawl as a side effect of construction.

     Opens the SQL database and a Redis connection, loads the HTTP
     configuration, calls latest_and_hot() and then gen_topic_queue().
     The database is closed before the constructor returns, including
     when one of those steps raises (the exception still propagates).

     >>>from v2ex_spider import rss_spider
     >>>rss_spider.Rss_spider()
     '''
     logging.info('start Rss spider')
     self.v2ex_rss_url_list = [
         'https://www.v2ex.com/index.xml',
         'https://www.v2ex.com/feed/tab/qna.xml',
         'https://www.v2ex.com/feed/tab/jobs.xml',
         'https://www.v2ex.com/feed/tab/deals.xml',
         'https://www.v2ex.com/feed/tab/city.xml',
         'https://www.v2ex.com/feed/tab/play.xml',
         'https://www.v2ex.com/feed/tab/apple.xml',
         'https://www.v2ex.com/feed/tab/creative.xml',
         'https://www.v2ex.com/feed/tab/tech.xml',
     ]
     self.latest_hot_api = [
         'https://www.v2ex.com/api/topics/latest.json',
         'https://www.v2ex.com/api/topics/hot.json',
     ]
     # presumably a delay in seconds for queued topic jobs -- TODO confirm
     self.topic_sleep_time = 10
     logging.debug('open sql database')
     self.SQ = SQL()
     self.SQ.open_datebase()
     try:
         self.redis_conn = Redis()
         self.load_config()
         #run
         try:
             self.latest_and_hot()
         except APIError:
             # Best effort: an API failure here must not prevent the
             # RSS step below from running.
             pass
         self.gen_topic_queue()
     finally:
         #end
         # Always release the database, even when a step above failed.
         self.SQ.close_datebase()
     logging.info('end the Rss spider')
Beispiel #2
0
 def __init__(self, url, sleep_time):
     '''
     Sleep, then run one crawl as a side effect of construction.

     url: stored on the instance as ``self.url``.
     sleep_time: passed through int() and slept before any other work.

     The SQL database is opened after the sleep and closed once
     load_config() and spider() have returned.

     >>>from v2ex_spider import base_spider
     >>>base_spider.start(url,sleep_time)
     '''
     self.url, self.sleep_time = url, sleep_time
     time.sleep(int(sleep_time))
     self.SQ = database = SQL()
     database.open_datebase()
     #run
     self.load_config()
     self.spider()
     #end
     self.SQ.close_datebase()
Beispiel #3
0
 def __init__(self):
     '''
     Prepare a run: open storage, load config and state, refresh nodes.

     $ python run.py
     or
     $ ./Run.sh
     '''
     logging.info('start')
     logging.debug('open sql database.')
     self.SQ = database = SQL()
     database.open_datebase()
     self.redis_conn = Redis()
     self.load_config()
     #base
     self.load_json()
     # update_cookies() is currently disabled here.
     try:
         self.update_nodes()
     except APIError:
         # A failed node refresh is tolerated; set-up still completes.
         pass
Beispiel #4
0
class spider(object):
    '''
    A base Spider for v2ex.

    Constructing an instance does the whole job: it sleeps, fetches
    ``url``, stores every topic of the JSON response through the SQL
    helper and closes the database again.
    '''
    def __init__(self, url, sleep_time):
        '''
        Sleep ``int(sleep_time)`` seconds, then crawl ``url`` once.

        The database is closed before the constructor returns, also when
        load_config() or spider() raises; the exception still propagates.

        >>>from v2ex_spider import base_spider
        >>>base_spider.start(url,sleep_time)
        '''
        self.url = url
        self.sleep_time = sleep_time
        time.sleep(int(self.sleep_time))
        self.SQ = SQL()
        self.SQ.open_datebase()
        try:
            #run
            self.load_config()
            self.spider()
        finally:
            #end
            self.SQ.close_datebase()

    def spider(self):
        '''
        Fetch ``self.url`` and write every returned topic to the database.

        Raises APIError when the response status is not 200.  Closing the
        database is left to the caller (``__init__`` does it in a
        ``finally`` block).
        '''
        # Bounded wait, like the other requests in this project, so an
        # unresponsive server cannot block forever.
        resp = self.s.get(self.url, timeout=10)
        if resp.status_code != 200:
            error_info = 'proxy status: %s, proxy: %s' % (str(
                settings.proxy_enable), str(self.s.proxies))
            raise APIError(error_info)
        topics = resp.json()
        for topic in topics:
            t_id = topic["id"]
            title = topic["title"]
            author = topic["member"]["username"]
            author_id = topic["member"]["id"]
            content = topic["content"]
            content_rendered = topic["content_rendered"]
            replies = topic["replies"]
            node = topic["node"]["id"]
            created = topic["created"]
            # time at which this snapshot of the topic was taken
            n_time = int(time.time())
            self.SQ.write_to_db_base(t_id, title, author, author_id, content,
                                     content_rendered, replies, node, created,
                                     n_time)
        # one commit for the whole batch
        self.SQ.conn.commit()
        return

    def load_config(self):
        '''
        Create the HTTP session ``self.s`` from the project settings.

        Headers come from ``settings.API_headers``; proxies are applied
        only when ``settings.proxy_enable`` is truthy.
        '''
        self.proxy_enable = settings.proxy_enable
        self.s = requests.session()
        self.s.headers = settings.API_headers
        if self.proxy_enable:
            self.s.proxies = settings.proxies
        return
Beispiel #5
0
 def __init__(self):
     '''
     Run one full scheduling pass as a side effect of construction.

     Opens the database and Redis, loads config and saved state, refreshes
     cookies and nodes, runs the RSS step and both taskers, then calls
     end().

     $ python run.py
     or
     $ ./Run.sh
     '''
     self.SQ = database = SQL()
     database.open_datebase()
     self.redis_conn = Redis()
     self.load_config()
     #start
     self.load_json()
     self.update_cookies()
     try:
         self.update_nodes()
     except APIError as err:
         # A failed node refresh is reported but does not stop the run.
         print(err)
     self.get_rss()
     self.tasker()
     self.tester_tasker()
     #end
     self.end()
Beispiel #6
0
class Start(object):
    '''
    Start the project.
    '''
    def __init__(self):
        '''
        Prepare a run: open storage, load config and state, refresh nodes.

        $ python run.py
        or
        $ ./Run.sh
        '''
        logging.info('start')
        logging.debug('open sql database.')
        self.SQ = database = SQL()
        database.open_datebase()
        self.redis_conn = Redis()
        self.load_config()
        #base
        self.load_json()
        # update_cookies() is currently disabled here.
        try:
            self.update_nodes()
        except APIError:
            # update_nodes() logs the failure itself before raising, so
            # set-up simply carries on.
            pass

    def Mode1(self):
        logging.info('start mode1')
        #start
        self.get_rss()
        self.tasker()
        self.topic_ids_enqueue()
        self.tester_tasker()
        #end
        self.end()

    def Mode2(self):
        logging.info('start mode2')
        #start

        self.get_rss()
        self.topic_ids_enqueue()
        self.tester_tasker()
        #end
        self.end()

    def end(self):
        self.SQ.close_datebase()
        self.dump_json()
        logging.info('end')

    def load_json(self):
        logging.debug('load json')
        #load .time_log.json
        if os.path.exists('.time_log.json'):
            with open('.time_log.json', 'r') as f:
                self.time_log = json.load(f)
        else:
            self.time_log = {
                'cookies_time': '0',
                'nodes_time': '0',
                '8000_node': '0',
                '4000_node': '0',
                '1000_node': '0',
                '500_node': '0',
                '0_node': '0',
                'rss_time': '0',
                'tester': '0',
                'topic_id_reenqueue': '0'
            }
        #load .node_number.json
        if os.path.exists('.node_number.json'):
            with open('.node_number.json', 'r') as f:
                self.node_number = json.load(f)
        else:
            self.node_number = list()
        return

    def dump_json(self):
        #dump .time_log.json
        with open('.time_log.json', 'w') as f1:
            json.dump(self.time_log, f1)
        #dump .node_number.json
        with open('.node_number.json', 'w') as f2:
            self.node_number = list(set(self.node_number))
            json.dump(self.node_number, f2)
        return

    def topic_ids_enqueue(self):
        if int(time.time()) - int(self.time_log['topic_id_reenqueue']) >= 1800:
            logging.info('start topic id reenqueue')
            max_id = topic_id_reenqueue.max_id
            topic_id_reenqueue.reenqueue_m(max_id - 2000, max_id - 29)
            self.time_log['topic_id_reenqueue'] = str(int(time.time()))
        return

    def update_cookies(self):
        if int(time.time()) - int(self.time_log["cookies_time"]) >= 86400 * 4:
            cookies_time_status = False
        else:
            cookies_time_status = True
        if not os.path.exists('cookies.txt') or cookies_time_status is False:
            logging.debug('update cookies')
            try:
                log_s = log_in.v2ex_log_in()
                log_s.log_in(3)
                log_s.save_cookies()
            except log_in.LogError as e:
                return
            self.time_log["cookies_time"] = str(int(time.time()))
        return

    def update_nodes(self):
        if int(time.time()) - int(self.time_log["nodes_time"]) >= 10800:
            nodes_time_status = False
        else:
            nodes_time_status = True
        if not nodes_time_status:
            logging.info('update nodes')
            try:
                resp = self.s.get('https://www.v2ex.com/api/nodes/all.json',
                                  timeout=10)
            except requests.exceptions.RequestException as e:
                logging.error('update_node failed.')
                logging.error('proxy_status: %s' % settings.i_proxy_enable)
                if settings.i_proxy_enable is True:
                    logging.error('proxy: %s' % self.s.proxies)
                logging.error(e)
                self.node_number = list(set(self.node_number))
                return
            if resp.status_code != 200:
                logging.error('update_node failed.')
                logging.error('proxy_status: %s' % settings.i_proxy_enable)
                if settings.i_proxy_enable is True:
                    logging.error('proxy: %s' % self.s.proxies)
                logging.error(APIError('update_node'))
                self.node_number = list(set(self.node_number))
                raise APIError('update_node')
            nodes = resp.json()
            for node in nodes:
                n_id = node["id"]
                name = node["name"]
                url = node["url"]
                title = node["title"]
                title_alternative = node["title_alternative"]
                topics = node["topics"]
                header = node["header"]
                footer = node["footer"]
                created = node["created"]
                n_time = int(time.time())
                if self.SQ.node_test(n_id, topics) is True:
                    self.node_number.append(int(n_id))
                self.SQ.write_to_db_node(n_id, name, url, title,
                                         title_alternative, topics, header,
                                         footer, created, n_time)
            self.time_log["nodes_time"] = str(int(time.time()))
        self.node_number = list(set(self.node_number))
        return

    def tasker(self):
        '''
        Enqueue a ``node_spider.start`` job for every node of each due tier.

        Nodes are grouped into five tiers by topic count.  Each tier has
        its own queue, per-job sleep time and minimum interval between
        runs; the three busiest tiers use a shorter interval when the
        local hour is >= 8 or < 2.  For the two smallest tiers
        ('node4', 'node5') only nodes listed in ``self.node_number`` are
        enqueued, and each is removed from that list once enqueued.
        '''
        # One row per tier:
        # (sql, sleep_time, time_log key, queue name,
        #  interval in the >=8 / <2 hours, interval in the other hours)
        tiers = (
            ('SELECT ID FROM NODES WHERE topics >= 8000;',
             5, '8000_node', 'node1', 900, 1800),
            ('SELECT ID FROM NODES WHERE topics BETWEEN 3000 AND 8000;',
             10, '4000_node', 'node2', 1800, 3600),
            ('SELECT ID FROM NODES WHERE topics BETWEEN 1000 AND 3000;',
             20, '1000_node', 'node3', 7200, 14400),
            ('SELECT ID FROM NODES WHERE topics BETWEEN 100 AND 1000;',
             90, '500_node', 'node4', 86400, 86400),
            ('SELECT ID FROM NODES WHERE topics BETWEEN 1 AND 100;',
             90, '0_node', 'node5', 86400, 86400),
        )
        filtered_queues = ('node4', 'node5')
        # NOTE(review): rebinding time.tzname does not appear to change the
        # zone used by time.strftime(); kept for compatibility -- confirm
        # the process really runs in the intended timezone.
        time.tzname = ('CST', 'CST')
        # Sample the hour once so both comparisons see the same value.
        hour = int(time.strftime('%H'))
        short_intervals = hour >= 8 or hour < 2
        for (sql, sleep_time, time_log_name, queue_name, short_gap,
             long_gap) in tiers:
            between_time = short_gap if short_intervals else long_gap
            q_node = Queue(queue_name, connection=self.redis_conn)
            elapsed = int(time.time()) - int(self.time_log[time_log_name])
            if elapsed < between_time:
                continue
            logging.info('start enqueue, queue name: %s' % queue_name)
            self.SQ.cursor.execute(sql)
            for row in self.SQ.cursor.fetchall():
                node_id = row[0]
                if queue_name in filtered_queues:
                    if node_id not in self.node_number:
                        continue
                    # enqueue each listed node only once
                    self.node_number.remove(int(node_id))
                q_node.enqueue(node_spider.start, node_id, sleep_time)
            self.time_log[time_log_name] = str(int(time.time()))
        return

    def get_rss(self):
        if int(time.time()) - int(self.time_log["rss_time"]) >= 600:
            logging.debug('start get_rss')
            try:
                rss_spider.Rss_spider()
            except requests.exceptions.RequestException as e:
                self.time_log["rss_time"] = str(int(time.time()))
                return
            self.time_log["rss_time"] = str(int(time.time()))
        return

    def load_config(self):
        '''
        Create the HTTP session ``self.s`` from the project settings.

        Headers come from ``settings.API_headers``; proxies from
        ``settings.i_proxies()`` are applied only when
        ``settings.i_proxy_enable`` is truthy.
        '''
        logging.debug('load config')
        self.proxy_enable = settings.i_proxy_enable
        self.s = session = requests.session()
        session.headers = settings.API_headers
        if self.proxy_enable:
            session.proxies = settings.i_proxies()

    def tester_tasker(self):
        if int(time.time()) - int(self.time_log["tester"]) >= 1800:
            logging.info('start enqueue tester')
            #losd json
            if os.path.exists('.topics_tester.json'):
                with open('.topics_tester.json', 'r') as f:
                    tmp_topics = json.load(f)
            else:
                tmp_topics = list()
            #main
            sql = "SELECT ID FROM TOPIC WHERE (time - created) < 345600 AND ID NOT IN (SELECT T_ID FROM STATUS) AND (STRFTIME('%s','now') - created) > 1209600;"
            sleep_time = 20
            self.SQ.cursor.execute(sql)
            topic_ids = [x[0] for x in self.SQ.cursor.fetchall()]
            q = Queue('tester', connection=self.redis_conn)
            for topic_id in topic_ids:
                if topic_id not in tmp_topics:
                    q.enqueue(topic_tester.start, topic_id, sleep_time)
                    tmp_topics.append(topic_id)
            #end
            tmp_topics = list(set(tmp_topics))
            with open('.topics_tester.json', 'w') as f:
                json.dump(tmp_topics, f)
            self.time_log["tester"] = str(int(time.time()))
        return
Beispiel #7
0
 def init_database(self):
     '''Create the SQL helper, keep it on ``self.SQ`` and open it.'''
     logging.debug('init database')
     self.SQ = database = SQL()
     database.open_datebase()
Beispiel #8
0
class tester(object):
    '''
    The tester for v2ex topics.
    '''
    def __init__(self):
        '''
        Create the web session used by web_test().

        Proxies from ``settings.proxies()`` are applied only when
        ``settings.proxy_enable`` is True; headers come from
        ``settings.WEB_headers``.  ``log_status`` starts as False.

        >>>from v2ex_tester import topic_tester
        >>>topic_tester(topic_id,sleep_time)
        '''
        logging.debug('init class tester')
        self.s = session = requests.session()
        if settings.proxy_enable is True:
            session.proxies = settings.proxies()
        session.headers = settings.WEB_headers
        self.log_status = False

    def init_database(self):
        '''Create the SQL helper, keep it on ``self.SQ`` and open it.'''
        logging.debug('init database')
        self.SQ = database = SQL()
        database.open_datebase()

    def log_in(self):
        '''
        Load cookies from ``.cookies.json`` into the web session and mark
        the instance as logged in (``log_status = True``).
        '''
        logging.debug('log in account')
        with open('.cookies.json', 'r') as f:
            saved_cookies = json.load(f)
        self.s.cookies = requests.utils.cookiejar_from_dict(saved_cookies)
        self.s.headers = settings.WEB_headers
        self.log_status = True

    def web_test(self, t_id, status):
        logging.debug('Start web_test')
        url = 'https://www.v2ex.com/t/%s' % str(t_id)
        n_time = int(time.time())
        try:
            resp = self.s.get(url, timeout=10)
        except requests.exceptions.RequestException as e:
            logging.error('web_test failed.')
            logging.error('proxy_status: %s' % settings.proxy_enable)
            if settings.proxy_enable is True:
                logging.error('proxy: %s' % self.s.proxies)
            logging.error(e)
            raise e
        if resp.status_code == 403:
            error_info = 'proxy status: %s, proxy: %s' % (str(
                settings.proxy_enable), str(self.s.proxies))
            logging.error('API Error: proxy status: %s, proxy: %s' %
                          (str(settings.proxy_enable), str(self.s.proxies)))
            raise APIError(error_info)
        if resp.status_code == 404 and '404 Topic Not Found' in resp.text:
            return {
                'T_ID': int(t_id),
                'NODE': None,
                'STATUS': 3,
                'TIME': n_time
            }
        if resp.url == 'https://www.v2ex.com/':
            return self.api_test(t_id, status=2)
        if 'signin' in resp.url and self.log_status is False:
            #             self.log_in()
            #             return self.web_test(t_id, status=1)
            return self.api_test(t_id, status=1)
        tree = etree.HTML(resp.text)
        node_name = re.findall(
            r'\/go\/(\w+)',
            tree.xpath('//div[@class="header"]/a[2]/@href')[0])[0]
        self.SQ.cursor.execute("SELECT ID FROM NODES WHERE name == '%s';" %
                               node_name)
        node_id = self.SQ.cursor.fetchone()[0]
        return {
            'T_ID': int(t_id),
            'NODE': node_id,
            'STATUS': status,
            'TIME': n_time
        }

    def api_test(self, t_id, status):
        '''
        Check topic ``t_id`` through the JSON API.

        A fresh session ``self.s_a`` is created for the call.  Returns a
        dict with the keys 'T_ID', 'NODE', 'STATUS' and 'TIME': an empty
        API answer gives NODE None and STATUS 3, otherwise NODE is the
        topic's node id and ``status`` is passed through.

        Raises the original ``requests`` exception when the request fails
        and APIError when the status code is not 200.
        '''
        logging.debug('Start api_test')
        self.s_a = requests.session()
        if settings.proxy_enable is True:
            self.s_a.proxies = settings.proxies()
        self.s_a.headers = settings.API_headers
        url = 'https://www.v2ex.com/api/topics/show.json?id=%s' % str(t_id)
        n_time = int(time.time())
        try:
            resp = self.s_a.get(url, timeout=10)
        except requests.exceptions.RequestException as e:
            logging.error('api_test failed.')
            logging.error('proxy_status: %s' % settings.proxy_enable)
            if settings.proxy_enable is True:
                # report the proxies of the session that made the request
                logging.error('proxy: %s' % self.s_a.proxies)
            logging.error(e)
            raise
        if resp.status_code != 200:
            error_info = 'proxy status: %s, proxy: %s' % (str(
                settings.proxy_enable), str(self.s_a.proxies))
            logging.error('API Error: %s' % error_info)
            raise APIError(error_info)
        # decode the body once
        topics = resp.json()
        if len(topics) == 0:
            return {
                'T_ID': int(t_id),
                'NODE': None,
                'STATUS': 3,
                'TIME': n_time
            }
        node_id = topics[0]["node"]["id"]
        return {
            'T_ID': int(t_id),
            'NODE': node_id,
            'STATUS': status,
            'TIME': n_time
        }

    def write_to_sql(self, T_ID, NODE, STATUS, TIME):
        self.SQ.write_to_db_status(T_ID, NODE, STATUS, TIME)
        return
Beispiel #9
0
class Rss_spider(object):
    '''
    A Spider for v2ex's Rss.
    Get the latest and hot topic on the index.
    Using the rss generate the topic list that need to spider.
    '''


    def __init__(self):
        '''
        Run one complete RSS crawl as a side effect of construction.

        Opens the SQL database and a Redis connection, loads the HTTP
        configuration, stores the latest/hot topics and enqueues the
        topics found in the RSS feeds.  The database is closed before the
        constructor returns, including when a step raises (the exception
        still propagates).

        >>>from v2ex_spider import rss_spider
        >>>rss_spider.Rss_spider()
        '''
        logging.info('start Rss spider')
        self.v2ex_rss_url_list = [
            'https://www.v2ex.com/index.xml',
            'https://www.v2ex.com/feed/tab/qna.xml',
            'https://www.v2ex.com/feed/tab/jobs.xml',
            'https://www.v2ex.com/feed/tab/deals.xml',
            'https://www.v2ex.com/feed/tab/city.xml',
            'https://www.v2ex.com/feed/tab/play.xml',
            'https://www.v2ex.com/feed/tab/apple.xml',
            'https://www.v2ex.com/feed/tab/creative.xml',
            'https://www.v2ex.com/feed/tab/tech.xml',
        ]
        self.latest_hot_api = [
            'https://www.v2ex.com/api/topics/latest.json',
            'https://www.v2ex.com/api/topics/hot.json',
        ]
        # passed along with every topic job enqueued by gen_topic_queue()
        self.topic_sleep_time = 10
        logging.debug('open sql database')
        self.SQ = SQL()
        self.SQ.open_datebase()
        try:
            self.redis_conn = Redis()
            self.load_config()
            #run
            try:
                self.latest_and_hot()
            except APIError:
                # latest_and_hot() logs the failure before raising; the
                # RSS step below must still run.
                pass
            self.gen_topic_queue()
        finally:
            #end
            # Always release the database, even when a step above failed.
            self.SQ.close_datebase()
        logging.info('end the Rss spider')
    
    def topics_id_rss(self):
        '''
        Collect the topic ids linked from every configured RSS feed.

        Returns a set of ints.  Only the ``link`` of each entry is read, so
        an entry lacking other fields cannot abort the crawl; entries whose
        link contains no topic id are skipped.
        '''
        logging.debug('fetch rss feeds')
        topic_ids = set()
        for v2ex_rss_url in self.v2ex_rss_url_list:
            feed = feedparser.parse(v2ex_rss_url)
            logging.debug('fetch rss feed: %s' % v2ex_rss_url)
            for item in feed["items"]:
                found = re.findall(r't\/(\d+)#?', item["link"])
                if not found:
                    # not a topic link, nothing to collect
                    continue
                topic_ids.add(int(found[0]))
        return topic_ids

    def topics_id_sqlite(self):
        logging.debug('SELECT ID FROM TOPIC')
        sql='SELECT ID FROM TOPIC;'
        self.SQ.cursor.execute(sql)
        topics_ids=[x[0] for x in self.SQ.cursor.fetchall()]
        return  topics_ids
    
    def latest_and_hot(self):
        logging.debug('start latest_and_hot')
        for url in self.latest_hot_api:
            try:
                resp=self.s.get(url, timeout=10)
            except requests.exceptions.RequestException as e:
                logging.error('latest_and_hot error')
                logging.error('proxy_status: %s' % self.proxy_enable)
                if self.proxy_enable is True:
                    logging.error('proxy: %s' % self.s.proxies)
                logging.error(e)
                raise e
            if resp.status_code != 200:
                logging.error('latest_and_hot error')
                logging.error('proxy_status: %s' % self.proxy_enable)
                if self.proxy_enable is True:
                    logging.error('proxy: %s' % self.s.proxies)
                logging.error(APIError('latest_and_hot'))
                raise APIError('latest_and_hot')
            topics=resp.json()
            for topic in topics:
                t_id=topic["id"]
                title=topic["title"]
                author=topic["member"]["username"]
                author_id=topic["member"]["id"]
                content=topic["content"]
                content_rendered=topic["content_rendered"]
                replies=topic["replies"]
                node=topic["node"]["id"]
                created=topic["created"]
                n_time=int(time.time())
                self.SQ.write_to_db_base(t_id,title,author,author_id,content,content_rendered,replies,node,created,n_time)
            self.SQ.conn.commit()
        return

    def gen_topic_queue(self):
        logging.debug('start topic enqueue')
        topics_sql=self.topics_id_sqlite()
        if len(topics_sql) <= 2000:
            return
        topics_rss=self.topics_id_rss()
        # load topics
        if os.path.exists('.topics_all.json'):
            with open('.topics_all.json','r') as f:
                tmp_topics=json.load(f)
        else:
            tmp_topics=list()
        t_queue=Queue('topic',connection=self.redis_conn)
        # gen queue
        for topic in topics_rss:
            if topic not in topics_sql and topic not in tmp_topics:
                topic_id=int(topic)
                t_queue.enqueue(topic_spider.start,topic_id, self.topic_sleep_time)
        #save topics
        topics_all=list()
        topics_all.extend(tmp_topics)
        topics_all.extend(topics_rss)
        topics_all.extend(topics_sql)
        topics_all=list(set(topics_all))
        with open('.topics_all.json','w') as f:
            json.dump(topics_all, f)
        return

    def load_config(self):
        '''
        Create the HTTP session ``self.s`` from the project settings.

        Headers come from ``settings.API_headers``; proxies from
        ``settings.i_proxies()`` are applied only when
        ``settings.i_proxy_enable`` is truthy.
        '''
        logging.debug('load config')
        self.proxy_enable = settings.i_proxy_enable
        self.s = session = requests.session()
        session.headers = settings.API_headers
        if self.proxy_enable:
            session.proxies = settings.i_proxies()
Beispiel #10
0
 def init_database(self):
     '''Create the SQL helper, keep it on ``self.SQ`` and open it.'''
     self.SQ = database = SQL()
     database.open_datebase()