def process_item(self, item):
    """Add *item* to the bloom filter unless the filter already contains it.

    Every successful insertion rewrites the filter to ``self.bloom_path``.

    Args:
        item: Value to test against, and insert into, ``self.bf``.

    Returns:
        ``item`` when it was newly added, ``None`` when ``self.bf`` already
        contained it.
    """
    if item in self.bf:
        logger.info('[%s] is already in bloom.' % item)
        return None

    print('add one')
    self.bf.add(item)
    # Use a context manager so the file handle is flushed and closed right
    # away instead of being left open until garbage collection.
    with open(self.bloom_path, 'wb') as bloom_file:
        self.bf.tofile(bloom_file)
    return item
def crawl(search_word, page_num):
    """Crawl listing pages 1 through *page_num* for *search_word*.

    Each page URL is printed and passed to ``parse_list``. Any exception
    raised while handling a page is logged at info level and the loop moves
    on to the next page.

    Args:
        search_word: Text substituted into the ``title`` query parameter.
        page_num: Number of the last listing page to visit (inclusive).
    """
    url_template = (
        'https://www.zimeika.com/video/lists/haokan.html'
        '?cate_id=&time_type=&read_order=&type=&author_id=&author_name='
        '&title=%s&p=%d'
    )
    for page in range(1, page_num + 1):
        try:
            list_url = url_template % (search_word, page)
            print('list_url = %s' % list_url)
            parse_list(list_url)
        except Exception as err:
            # A failing page must not stop the remaining pages.
            logger.info(err)
def parse_list(list_url, timeout=10):
    """Download one listing page and crawl every detail page it links to.

    Args:
        list_url: URL of the listing page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up;
            without it a stalled connection would block the crawl forever.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched (including on timeout). Failures on individual detail
            pages are logged and skipped instead.
    """
    html = requests.get(list_url, headers=headers, timeout=timeout).text
    tree = etree.HTML(html)
    detail_paths = tree.xpath("//ul[@class='video-list']/li/a/@href")
    for path in detail_paths:
        try:
            # The hrefs are appended to the site root to form the detail URL.
            detail_url = 'https://www.zimeika.com' + path
            parse_detail(detail_url)
        except Exception as e:
            # One broken detail page must not abort the rest of the listing.
            logger.info(e)
def parse_list_with_selenium(list_url):
    """Load a listing page in PhantomJS and crawl every detail link on it.

    Args:
        list_url: URL of the listing page to open in the browser.

    Failures on individual links are logged at info level and skipped. The
    browser session is always shut down before returning, even when loading
    the page or locating the links raises.
    """
    driver = webdriver.PhantomJS()
    try:
        driver.get(list_url)
        details = driver.find_elements_by_xpath(
            "//ul[@class='video-list']/li/a")
        for detail in details:
            try:
                detail_url = detail.get_attribute('href')
                parse_detail(detail_url)
            except Exception as e:
                logger.info(e)
    finally:
        # The located elements are read through the live session, so the
        # browser may only be shut down after the loop. Doing it in
        # ``finally`` keeps a PhantomJS process from leaking on every call.
        driver.quit()
def parse_detail(detail_url, timeout=10):
    """Scrape one detail page and pass its title and video URL to ``save_info``.

    The URL is first handed to ``MyBloomUtil('zimeika_crawler').process_item``;
    when that returns a falsy value the page is treated as already crawled and
    nothing is downloaded.

    Args:
        detail_url: URL of the detail page.
        timeout: Seconds to wait for the server before ``requests`` gives up;
            without it a stalled connection would block the crawl forever.

    Raises:
        IndexError: If the page contains no matching title or video link.
        requests.RequestException: If the page cannot be fetched.
    """
    bfu = MyBloomUtil('zimeika_crawler')
    processed_detail_url = bfu.process_item(detail_url)
    if not processed_detail_url:
        logger.info('%s has already been crawled.' % detail_url)
        return

    # NOTE(review): the de-duplication step runs before the download, so a URL
    # whose fetch fails below is presumably not retried later - confirm
    # against MyBloomUtil.
    html = requests.get(
        processed_detail_url, headers=headers, timeout=timeout).text
    tree = etree.HTML(html)
    title = tree.xpath("//h1[@class='article-title']/a/text()")[0]
    video_url = tree.xpath(
        "//article/div[@class='content-text']/div[@class='thumbnail']/a/@href"
    )[0]
    print('title = %s' % title)
    print('video_url = %s' % video_url)
    # Downloading is currently disabled; only the metadata is stored.
    # download_video(title, video_url)
    save_info(title, video_url)
import multiprocessing import configparser import grpc APP_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append(APP_DIR) sys.path.append(os.path.join(APP_DIR, "grpc_file")) from utils.log_util import logger from grpc_file import side_feature_pb2_grpc from service.side_feature_service import SideFeatureService config_ini_dict = configparser.ConfigParser() config_ini_dict.read(os.path.join(APP_DIR, "config.ini")) logger.info(config_ini_dict) if __name__ == '__main__': server = grpc.server( concurrent.futures.ThreadPoolExecutor( max_workers=multiprocessing.cpu_count() * 5)) side_feature_pb2_grpc.add_side_feature_serviceServicer_to_server( SideFeatureService(), server) server.add_insecure_port("[::]:{}".format( config_ini_dict["SERVER"]["PORT"])) server.start() logger.info("server start...") try: while True:
def handler(event, context):
    """Handle a Slack callback delivered as a serverless event.

    Two kinds of request body are recognised under ``event['body']``:

    * a body containing ``challenge`` (Slack verification) - the challenge is
      echoed back as JSON with status 200;
    * a body containing ``event`` - messages carrying a ``bot_id`` are
      ignored, any other non-empty message is answered with a fixed reply
      posted to ``SLACK_URL`` using ``BOT_TOKEN``.

    Args:
        event: Mapping describing the incoming request; only its ``body``
            entry is read.
        context: Unused.

    Returns:
        A response dict (empty when nothing matched), or ``None`` when an
        exception was raised; the exception is logged, not propagated.
    """
    logger.info("Request Event: {}".format(event))
    try:
        # Default empty response
        response = dict()
        if 'body' in event:
            request_body_json = event['body']
            logger.info('Received API Gateway Request with Body: {}'.format(
                request_body_json))

            if 'challenge' in request_body_json:
                # For verification by Slack: echo the challenge back.
                challenge = request_body_json["challenge"]
                logger.info('Challenge: {}'.format(challenge))
                response = {
                    'status_code': 200,
                    'body': json.dumps({'challenge': challenge}),
                }

            if 'event' in request_body_json:
                slack_event = request_body_json['event']
                logger.info(
                    'Received Slack Event with Body: {}'.format(slack_event))
                if 'bot_id' in slack_event:
                    # Bot events are ignored; capture them here if needed.
                    # ``warning`` replaces the deprecated ``warn`` alias.
                    logger.warning('Ignored bot event')
                else:
                    # User event: read the text of the message sent to the bot.
                    user_message = slack_event['text']
                    logger.info('User Message: {}'.format(user_message))

                    # Create your bot reply logic here. For now this is a
                    # hardcoded reply; ideally an NLP service such as Watson
                    # Assistant or Rasa NLU would answer natural-language
                    # user text.
                    bot_reply = "Hello I am the Serverless Slack Bot"

                    # ID of the channel where the message was posted.
                    channel_id = slack_event["channel"]

                    if len(user_message) > 0:
                        # The reply is sent form-encoded rather than as JSON.
                        data = urllib.parse.urlencode(
                            (("token", BOT_TOKEN),
                             ("channel", channel_id),
                             ("text", bot_reply)))
                        data = data.encode("ascii")

                        # Construct the HTTP request sent to the Slack API.
                        request = urllib.request.Request(
                            SLACK_URL, data=data, method="POST")
                        # Tell Slack that the payload is URL-encoded.
                        request.add_header(
                            "Content-Type",
                            "application/x-www-form-urlencoded")

                        # Fire the request; the context manager closes the
                        # HTTP response instead of leaking the connection.
                        with urllib.request.urlopen(request) as reply:
                            reply.read()

                        # Success
                        # NOTE(review): the success response is set only after
                        # a reply was posted - confirm this nesting against
                        # the intended behaviour for bot/empty messages.
                        response = {
                            'status_code': 200,
                        }
        logger.info("Response: {}".format(response))
        return response
    except Exception as e:
        # Error: log with traceback; the caller receives None.
        logger.exception(e)
        return None