def process_item(self, item):
    """Record *item* in the bloom filter unless it is already present.

    Args:
        item: Value to look up in, and possibly add to, ``self.bf``.

    Returns:
        ``None`` when ``item`` is already in the filter; otherwise ``item``
        itself, after it has been added and the filter has been written to
        the file at ``self.bloom_path``.
    """
    if item in self.bf:
        logger.info('[%s] is already in bloom.' % item)
        return None

    print('add one')
    self.bf.add(item)
    # The filter is rewritten to disk after every addition. The context
    # manager closes (and therefore flushes) the file even if tofile() raises,
    # instead of leaving an open handle for the garbage collector.
    with open(self.bloom_path, 'wb') as bloom_file:
        self.bf.tofile(bloom_file)
    return item
def crawl(search_word, page_num):
    """Visit result pages 1 through *page_num* for *search_word*.

    For each page number the list URL is built, printed, and handed to
    ``parse_list``. Any exception raised while handling one page is logged
    and the loop moves on to the next page.
    """
    url_template = (
        'https://www.zimeika.com/video/lists/haokan.html'
        '?cate_id=&time_type=&read_order=&type=&author_id=&author_name='
        '&title=%s&p=%d'
    )
    for page_no in range(1, page_num + 1):
        try:
            list_url = url_template % (search_word, page_no)
            print('list_url = %s' % list_url)
            parse_list(list_url)
        except Exception as err:
            # Best effort: a failing page must not abort the remaining pages.
            logger.info(err)
def parse_list(list_url, timeout=10):
    """Download a list page and crawl every video detail page it links to.

    Args:
        list_url: URL of the list page to download.
        timeout: Seconds to wait for the HTTP response before giving up.

    Raises:
        Whatever ``requests.get`` raises when the list page itself cannot be
        fetched (including on timeout). Errors raised while handling an
        individual detail link are logged and do not stop the loop.
    """
    from urllib.parse import urljoin

    # Without an explicit timeout the request can block indefinitely on an
    # unresponsive server and stall the whole crawl.
    html = requests.get(list_url, headers=headers, timeout=timeout).text
    text = etree.HTML(html)
    detail_urls = text.xpath("//ul[@class='video-list']/li/a/@href")
    for url in detail_urls:
        try:
            # urljoin yields the same result as plain concatenation for
            # site-relative hrefs ("/video/...") and leaves already-absolute
            # hrefs intact instead of producing a malformed URL.
            detail_url = urljoin('https://www.zimeika.com', url)
            parse_detail(detail_url)
        except Exception as e:
            # Best effort: one bad detail page must not abort the others.
            logger.info(e)
def parse_list_with_selenium(list_url):
    """Render a list page in PhantomJS and crawl every detail link found on it.

    Args:
        list_url: URL of the list page to load in the browser.

    Errors raised while handling an individual link are logged and do not
    stop the loop. Errors from loading the page or locating the links
    propagate to the caller, after the browser has been shut down.
    """
    driver = webdriver.PhantomJS()
    try:
        driver.get(list_url)
        details = driver.find_elements_by_xpath(
            "//ul[@class='video-list']/li/a")
        for detail in details:
            try:
                detail_url = detail.get_attribute('href')
                parse_detail(detail_url)
            except Exception as e:
                logger.info(e)
    finally:
        # The element handles are read inside the loop, so the browser must
        # stay alive until here. Shut it down on every exit path; otherwise
        # each call leaves a PhantomJS process running.
        driver.quit()
def parse_detail(detail_url, timeout=10):
    """Download one detail page, extract its title and video link, and save them.

    The URL is first passed through ``MyBloomUtil.process_item``; when that
    returns a falsy value the page is treated as already crawled and skipped.

    Args:
        detail_url: Absolute URL of the detail page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Raises:
        IndexError: if the page contains no title or no video link.
        Whatever ``requests.get`` raises when the page cannot be fetched
        (including on timeout).

    NOTE(review): if ``MyBloomUtil.process_item`` records the URL as soon as
    it is first seen, a page whose download or parsing fails afterwards will
    not be retried on a later run -- confirm whether that is intended.
    """
    bfu = MyBloomUtil('zimeika_crawler')
    processed_detail_url = bfu.process_item(detail_url)
    if not processed_detail_url:
        logger.info('%s has already been crawled.' % detail_url)
        return

    # Without an explicit timeout the request can block indefinitely.
    html = requests.get(
        processed_detail_url, headers=headers, timeout=timeout).text
    text = etree.HTML(html)
    titles = text.xpath("//h1[@class='article-title']/a/text()")
    video_urls = text.xpath(
        "//article/div[@class='content-text']/div[@class='thumbnail']/a/@href"
    )
    if not titles or not video_urls:
        # Same exception type as indexing the empty result would raise, but
        # with a message that identifies the offending page.
        raise IndexError(
            'title or video link not found on %s' % processed_detail_url)
    title = titles[0]
    video_url = video_urls[0]

    print('title = %s' % title)
    print('video_url = %s' % video_url)
    # download_video(title, video_url)
    save_info(title, video_url)
Example #6
0
import multiprocessing
import configparser
import grpc

# Absolute path of the directory that contains this script.
APP_DIR = os.path.dirname(os.path.abspath(__file__))
# Extend the module search path with the script directory and its "grpc_file"
# sub-directory. These appends run before the project imports below --
# presumably so those imports resolve; keep this order.
sys.path.append(APP_DIR)
sys.path.append(os.path.join(APP_DIR, "grpc_file"))

from utils.log_util import logger

from grpc_file import side_feature_pb2_grpc
from service.side_feature_service import SideFeatureService

# Despite its name this is a ConfigParser instance, not a plain dict. It is
# filled from "config.ini" located next to this script; ConfigParser.read()
# silently skips files it cannot open.
config_ini_dict = configparser.ConfigParser()
config_ini_dict.read(os.path.join(APP_DIR, "config.ini"))
# NOTE(review): this logs the ConfigParser object itself, not its contents.
logger.info(config_ini_dict)

if __name__ == '__main__':
    server = grpc.server(
        concurrent.futures.ThreadPoolExecutor(
            max_workers=multiprocessing.cpu_count() * 5))

    side_feature_pb2_grpc.add_side_feature_serviceServicer_to_server(
        SideFeatureService(), server)

    server.add_insecure_port("[::]:{}".format(
        config_ini_dict["SERVER"]["PORT"]))
    server.start()
    logger.info("server start...")
    try:
        while True:
def _post_slack_message(channel_id, text, timeout=10):
    """POST *text* to the Slack channel *channel_id* via ``SLACK_URL``.

    The request body is form-encoded (token, channel, text) and authenticated
    with ``BOT_TOKEN``.

    Args:
        channel_id: ID of the channel to post to.
        text: Message text to send.
        timeout: Seconds to wait for Slack before giving up.

    Raises:
        urllib.error.URLError: if the request fails or times out.
    """
    payload = urllib.parse.urlencode(
        (("token", BOT_TOKEN), ("channel", channel_id), ("text", text)))

    request = urllib.request.Request(SLACK_URL,
                                     data=payload.encode("ascii"),
                                     method="POST")
    # Tell the server that the body is URL-encoded form data.
    request.add_header("Content-Type", "application/x-www-form-urlencoded")

    # Close the HTTP response deterministically instead of leaving the
    # connection open until garbage collection, and bound the wait so an
    # unresponsive endpoint cannot stall the function.
    with urllib.request.urlopen(request, timeout=timeout) as reply:
        reply.read()


def handler(event, context):
    """Handle an incoming request carrying a Slack event.

    Two kinds of request body are recognised (both may be present):

    * a ``challenge`` key -- the challenge value is echoed back in the
      response body (used by Slack for verification);
    * an ``event`` key -- events carrying a ``bot_id`` are ignored, any other
      event with non-empty ``text`` is answered with a fixed reply posted to
      the event's channel.

    Args:
        event: Mapping describing the request; only its ``body`` entry is
            read. NOTE(review): the body is indexed like a mapping, so it is
            assumed to be parsed already rather than a raw JSON string --
            confirm.
        context: Not used by this function.

    Returns:
        The response dict: empty when nothing was handled, otherwise it holds
        ``status_code`` 200 (plus ``body`` for a challenge). If an exception
        occurs it is logged in the ``except`` branch at the end of this block.
    """
    logger.info("Request Event: {}".format(event))
    try:
        # Default empty response
        response = dict()
        if 'body' in event:
            request_body_json = event['body']
            logger.info('Received API Gateway Request with Body: {}'.format(
                request_body_json))
            if 'challenge' in request_body_json:
                # For verification by Slack
                challenge = request_body_json["challenge"]
                logger.info('Challenge: {}'.format(challenge))
                response = {
                    'status_code': 200,
                    'body': json.dumps({'challenge': challenge})
                }

            if 'event' in request_body_json:
                slack_event = request_body_json['event']
                logger.info(
                    'Received Slack Event with Body: {}'.format(slack_event))
                if 'bot_id' in slack_event:
                    # Bot events are ignored; capture them here if needed.
                    logger.warning('Ignored bot event')
                else:
                    # User event: get the text of the message sent to the bot.
                    user_message = slack_event['text']
                    logger.info('User Message: {}'.format(user_message))

                    # Hardcoded reply for now. Ideally an NLP service (e.g.
                    # Watson Assistant or Rasa NLU) would produce the answer
                    # to the user's natural-language text.
                    bot_reply = "Hello I am the Serverless Slack Bot"

                    # ID of the channel where the message was posted.
                    channel_id = slack_event["channel"]

                    if len(user_message) > 0:
                        _post_slack_message(channel_id, bot_reply)

                        # Success
                        response = {
                            'status_code': 200,
                        }
        logger.info("Response: {}".format(response))
        return response
    except Exception as e:
        # Error
        logger.error(e)