__author__ = 'PatrickYeh'

import receiver
import pickle
from pttcrawler import Logger
from kafka import KafkaProducer

log = Logger.getLogger("kafka_producer")

class kafka_producer(receiver.receiver):
    """Receiver that forwards incoming data objects to a Kafka topic."""

    def __init__(self):
        # Default topic; subclasses normally override via set_topic().
        self.topic = "Message"
        # Not connected until set_kafka_client() is called.
        self.producer = None

    def set_kafka_client(self, host, port):
        """Connect to the Kafka broker at host:port."""
        self.producer = KafkaProducer(
            bootstrap_servers="{host}:{port}".format(host=host, port=port))

    def set_topic(self, topic):
        """Select the topic that send() publishes to."""
        self.topic = topic

    def send(self, obj_data):
        """Publish obj_data to the configured topic.

        NOTE(review): no value_serializer is configured, so KafkaProducer
        expects obj_data to already be bytes -- confirm against callers.
        Raises RuntimeError if set_kafka_client() was never called (the
        original failed here with a bare AttributeError instead).
        """
        # getattr guard also covers subclasses that skip __init__.
        if getattr(self, "producer", None) is None:
            raise RuntimeError("Kafka client not configured; "
                               "call set_kafka_client() first")
        log.debug("Broadcast Data")
        self.producer.send(self.topic, obj_data)

class article_producer(kafka_producer):
    """Kafka producer bound to the "ptt_article" topic."""

    def __init__(self):
        # Bug fix: initialize base-class state before binding the topic
        # (the original skipped kafka_producer.__init__ entirely).
        kafka_producer.__init__(self)
        self.set_topic("ptt_article")

class reply_producer(kafka_producer):
    """Kafka producer bound to the "ptt_reply" topic."""

    def __init__(self):
        # Bug fix: initialize base-class state before binding the topic
        # (the original skipped kafka_producer.__init__ entirely).
        kafka_producer.__init__(self)
        self.set_topic("ptt_reply")
# ---- Example #2 (score: 0) ----
# coding=utf-8
__author__ = 'PatrickYeh'

import re

from pttcrawler import Logger
from pttcrawler.WebRetriever import WebRetriever
from pttcrawler.Page import Page
from pttcrawler.Article import Article
from BeautifulSoup import BeautifulSoup
log = Logger.getLogger("PttBoard")

# Raw strings so `\d` is handled by the regex engine rather than being a
# deprecated/invalid string escape (SyntaxWarning on modern Python); the
# compiled patterns are byte-identical to the originals.
# Captures the numeric page index from a board index URL ("...indexN.html").
PAGE_REG = r".*index(?P<page_num>\d*).html"
# Captures the article id (final path component, without ".html") from a URL.
ARTICLE_URL_REG = r".*/(?P<article_id>.*).html"
article_url_pattern = re.compile(ARTICLE_URL_REG)
class Board(Page):
    """A single PTT board index page.

    NOTE(review): relies on the Page base class for `self.url` and
    `_fetch_data` -- neither is defined in this class; confirm against Page.
    """

    def __init__(self, board_id="Gossiping"):
        # URL template; `page_idx` is presumably substituted by the Page
        # base class -- it is never formatted here.
        self.base_url = 'https://www.ptt.cc/bbs/{board_id}/index{page_idx}.html'
        self.board_id = board_id
        self.refresh()

    def refresh(self):
        """Re-download the board index and keep the parsed soup."""
        self.html_raw_soup = self._fetch_data(self.url)

    def _article_list_iter(self, lst_article_idx):
        # Lazily wrap each article id in an Article for this board.
        return (Article(board_id=self.board_id, article_id=idx)
                for idx in lst_article_idx)

    def get_articles(self, lst_article_list):
        """Return an iterator of Article objects for the given ids."""
        return self._article_list_iter(lst_article_list)
# ---- Example #3 (score: 0) ----
__author__ = 'PatrickYeh'
import threading,time,pickle,json
from pttcrawler import Logger
from pttcrawler.Board import Board
from pttcrawler.Article import Article
import kafka_producer
log = Logger.getLogger("Monitor")

class monitor(threading.Thread):
    """Dispatches data objects to a set of named receivers.

    Subclasses threading.Thread so concrete monitors can poll in the
    background; each registered receiver must expose a send(obj_data) method.
    """

    def __init__(self):
        # Bug fix: Thread.__init__ must run before start() may be called;
        # the original skipped it, so starting a plain monitor raised
        # "thread.__init__() not called".
        threading.Thread.__init__(self)
        self.dict_receiver = {}

    def set_receiver(self, name, obj_receiver):
        """Register (or replace) a receiver under `name`."""
        self.dict_receiver[name] = obj_receiver

    def get_receiver(self, name):
        """Return the receiver registered as `name` (KeyError if absent)."""
        return self.dict_receiver[name]

    def send(self, name, obj_data):
        """Forward obj_data to the single receiver registered as `name`."""
        self.dict_receiver[name].send(obj_data)

    def broadcast(self, obj_data):
        """Forward obj_data to every registered receiver."""
        # Iterate values directly instead of keys + re-lookup.
        for obj_receiver in self.dict_receiver.values():
            obj_receiver.send(obj_data)


class article_monitor(monitor):
    def __init__(self,board_id,article_id):
        threading.Thread.__init__(self)
        monitor.__init__(self)
# coding=utf-8
__author__ = 'Vetom'

import requests,time
from BeautifulSoup import BeautifulSoup
from pttcrawler import Logger


requests.packages.urllib3.disable_warnings()

log = Logger.getLogger("WebRetriever")

class WebRetriever():
    """Fetches a URL and returns it parsed as a BeautifulSoup document."""

    def make_request(self, str_url, timeout=10):
        """GET str_url, retrying forever on errors, and return its soup.

        timeout: per-attempt timeout in seconds (new, defaulted parameter --
        the original request had none and could hang indefinitely).
        The over18 cookie satisfies PTT's age gate; TLS verification stays
        disabled, as in the original code.
        """
        log.debug("Make Query: {url}".format(url=str_url))
        while True:
            try:
                html_raw = requests.get(str_url, verify=False,
                                        cookies={'over18': '1'},
                                        timeout=timeout)
                html_raw.encoding = 'utf-8'
                break
            except Exception as e:
                # Best-effort retry loop: log, back off briefly, try again.
                log.error("URL:{url} - ERROR: {err}".format(url=str_url, err=e))
                time.sleep(2)

        return BeautifulSoup(html_raw.text)


if __name__ == '__main__':
    data = WebRetriever().make_request('https://www.ptt.cc/bbs/joke/index.html')
    print data