def test_basic():
    client = CloudAMQPClient(CLOUDAMQP_URL, TEST_QUEUE_NAME)
    sentMsg = {'test': 'demo'}
    client.sendMessage(sentMsg)
    # Give the message time to round-trip through the queue.
    client.sleep(10)
    receivedMsg = client.getMessage()
    assert sentMsg == receivedMsg
    print('test_basic passed!')
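# The CloudAMQPClient implementation itself never appears in these snippets,
# only its callers. Below is a minimal sketch of what cloudAMQP_client.py
# likely looks like, assuming pika as the AMQP library; the method names
# (sendMessage, getMessage, sleep) are taken from the call sites above.
import json

import pika


class CloudAMQPClient(object):
    def __init__(self, cloud_amqp_url, queue_name):
        self.cloud_amqp_url = cloud_amqp_url
        self.queue_name = queue_name
        params = pika.URLParameters(cloud_amqp_url)
        params.socket_timeout = 3
        self.connection = pika.BlockingConnection(params)
        self.channel = self.connection.channel()
        self.channel.queue_declare(queue=queue_name)

    def sendMessage(self, message):
        # Publish the JSON-serialized message to the default exchange.
        self.channel.basic_publish(exchange='',
                                   routing_key=self.queue_name,
                                   body=json.dumps(message))

    def getMessage(self):
        # Pull a single message; return None if the queue is empty.
        method_frame, _, body = self.channel.basic_get(self.queue_name)
        if method_frame:
            self.channel.basic_ack(method_frame.delivery_tag)
            return json.loads(body)
        return None

    def sleep(self, seconds):
        # BlockingConnection.sleep keeps the heartbeat alive while pausing,
        # unlike time.sleep, which can drop the broker connection.
        self.connection.sleep(seconds)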
# RabbitMQ config
### REPLACE CLOUD_AMQP_URL WITH YOUR OWN ###
CLOUD_AMQP_URL = ''
DATA_FETCHER_QUEUE_NAME = 'dataFetcherTaskQueue'

# MongoDB config
PROPERTY_TABLE_NAME = 'property'

FETCH_SIMILAR_PROPERTIES = True

SECONDS_IN_ONE_DAY = 3600 * 24
SECONDS_IN_ONE_WEEK = SECONDS_IN_ONE_DAY * 7
WAITING_TIME = 3

cloudAMQP_client = CloudAMQPClient(CLOUD_AMQP_URL, DATA_FETCHER_QUEUE_NAME)


def handle_message(msg):
    task = json.loads(msg)
    if (not isinstance(task, dict)
            or 'zpid' not in task
            or task['zpid'] is None):
        return
    zpid = task['zpid']

    # Scrape Zillow for the property details
    property_detail = zillow_web_scraper_client.get_property_by_zpid(zpid)

    # Add a timestamp
    property_detail['last_update'] = time.time()
from cloudAMQP_client import CloudAMQPClient

CLOUDAMQP_URL = 'amqp://*****:*****@hyena.rmq.cloudamqp.com/kdflangt'
QUEUE_NAME = 'dataFetcherTaskQueue'

# Initialize a client
client = CloudAMQPClient(CLOUDAMQP_URL, QUEUE_NAME)

# Send a message
client.sendDataFetcherTask({'zpid': '83154148'})

# Receive a message
# client.getDataFetcherTask()
import os
import sys
from sets import Set

import operations

# Import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import mongodb_client
from cloudAMQP_client import CloudAMQPClient

CLICK_LOGS_TABLE_NAME = "click_logs"
LOG_CLICKS_TASK_QUEUE_URL = "localhost"
LOG_CLICKS_TASK_QUEUE_NAME = "tap-news-log-clicks-task-queue"

cloudAMQP_client = CloudAMQPClient(LOG_CLICKS_TASK_QUEUE_URL,
                                   LOG_CLICKS_TASK_QUEUE_NAME)


# Start Redis and MongoDB before running the following tests.
def test_getNewsSummariesForUser_basic():
    news = operations.getNewsSummariesForUser('test', 1)
    print(news)
    assert len(news) > 0
    print('test_getNewsSummariesForUser_basic passed!')


def test_getNewsSummariesForUser_pagination():
    news_page_1 = operations.getNewsSummariesForUser('test', 1)
    news_page_2 = operations.getNewsSummariesForUser('test', 2)
import mongodb_client
import news_topic_modeling_service_client
from cloudAMQP_client import CloudAMQPClient

# TODO: use your own queue.
DEDUPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@gull.rmq.cloudamqp.com/gxaifheg"
DEDUPE_NEWS_TASK_QUEUE_NAME = "tap-news-dedupe-news-task-queue"

SLEEP_TIME_IN_SECONDS = 1

# The table that stores the news
NEWS_TABLE_NAME = "news"
SAME_NEWS_SIMILARITY_THRESHOLD = 0.9

cloudAMQP_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                   DEDUPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task['text']
    if text is None:
        return

    # Get all recent news based on publishedAt
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(
        published_at.year, published_at.month, published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)
import os
import sys

import news_deduper as deduper

# Import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

from cloudAMQP_client import CloudAMQPClient

DEDUPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@otter.rmq.cloudamqp.com/xemwnewz"
DEDUPE_NEWS_TASK_QUEUE_NAME = "tap-news-dedupe-news-task-queue"

cloudAMQP_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                   DEDUPE_NEWS_TASK_QUEUE_NAME)

TEST_MSG1 = ""
TEST_MSG2 = cloudAMQP_client.get_message()


def test_basic():
    deduper.handle_message(TEST_MSG1)
    deduper.handle_message(TEST_MSG2)
    print('test_basic passed!')


if __name__ == "__main__":
    test_basic()
                                  'config/databases.yaml')
CLOUDAMQP_CONFIG_FILE = os.path.join(os.path.dirname(__file__), '..',
                                     'config/cloudAMQP.yaml')

with open(DB_CONFIG_FILE, 'r') as dbCfg:
    db_config = yaml.load(dbCfg)
with open(CLOUDAMQP_CONFIG_FILE, 'r') as amqpCfg:
    cloudAMQP_config = yaml.load(amqpCfg)
with open(NEWS_CONFIG_FILE, 'r') as newsCfg:
    news_config = yaml.load(newsCfg)

NEWS_TABLE_NAME = db_config['mongodb']['write_news_table']

dedupe_news_queue_client = CloudAMQPClient(
    cloudAMQP_config['url'], cloudAMQP_config['dedupe_queue_name'])


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task['text'].encode('utf-8', 'ignore')
    if text == '':
        print('Message has no text!')
        return

    # Get all recent news based on publishedAt
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
# Import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
# sys.path.append(os.path.join(os.path.dirname(__file__), './', 'scrapers'))

import log_client
# import cnn_news_scraper
from cloudAMQP_client import CloudAMQPClient

LOG_GRAPHITE_TASK_QUEUE_URL = config['operations']['LOG_GRAPHITE_TASK_QUEUE_URL']
LOG_GRAPHITE_TASK_QUEUE_NAME = config['operations']['LOG_GRAPHITE_TASK_QUEUE_NAME']
SLEEP_TIME_IN_SECONDS = config['operations']['SLEEP_TIME_IN_SECONDS']

graphitelog_cloudAMQP_client = CloudAMQPClient(LOG_GRAPHITE_TASK_QUEUE_URL,
                                               LOG_GRAPHITE_TASK_QUEUE_NAME)


def handle_message(msg):
    if msg is None:
        log_client.logger.info('message is broken')
        return
    counter = statsd.Counter(msg)
    counter += 1


while True:
    # Fetch a message from the queue
    if graphitelog_cloudAMQP_client is not None:
    'the-washington-post',
    'the-wall-street-journal',
    'usa-today'
]

# Redis params
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
NEWS_TIMEOUT_IN_SECONDS = 3600 * 24 * 3

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)

# CloudAMQP params
CLOUDAMQP_SLEEP_TIMEOUT_IN_SECONDS = 10
CLOUDAMQP_NEWS_SCRAPER_QUEUE_URL = 'amqp://*****:*****@crane.rmq.cloudamqp.com/ceunvrpr'
CLOUDAMQP_NEWS_SCRAPER_QUEUE_NAME = 'tap-news-scrape-news-task-queue'

cloudAMQP_client = CloudAMQPClient(CLOUDAMQP_NEWS_SCRAPER_QUEUE_URL,
                                   CLOUDAMQP_NEWS_SCRAPER_QUEUE_NAME)

while True:
    news_list = news_api_client.getNewsFromSources(NEWS_SOURCES)

    num_of_news = 0
    for news in news_list:
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')
        if redis_client.get(news_digest) is None:
            news['digest'] = news_digest
            num_of_news += 1
            if news['publishedAt'] is None:
                news['publishedAt'] = datetime.datetime.utcnow().strftime(
                    '%Y-%m-%dT%H:%M:%SZ')
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
sys.path.append(os.path.join(os.path.dirname(__file__), 'scrapers'))

import cnn_news_scraper
from cloudAMQP_client import CloudAMQPClient

# Use your own CloudAMQP queue
DEDUPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@donkey.rmq.cloudamqp.com/ztdiqmcn"
DEDUPE_NEWS_TASK_QUEUE_NAME = "tap-news-dedupe-news-task-queue"
SCRAPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@donkey.rmq.cloudamqp.com/ztdiqmcn"
SCRAPE_NEWS_TASK_QUEUE_NAME = "tap-news-scrape-news-task-queue"

SLEEP_TIME_IN_SECONDS = 5

dedupe_news_queue_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                           DEDUPE_NEWS_TASK_QUEUE_NAME)
scrape_news_queue_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                           SCRAPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print('message is broken')
        return
    task = msg
    # Download and parse the article with newspaper
    article = Article(task['url'])
    article.download()
    article.parse()
    print(article.text)

    task['text'] = article.text
    dedupe_news_queue_client.sendMessage(task)
NEWS_TIME_OUT_IN_SECONDS = 3600 * 24  # news expires in one day
SLEEP_TIME_IN_SECONDS = 10  # pause between loop iterations

SCRAPE_NEWS_TASK_QUEUE_URL = 'amqps://*****:*****@chimpanzee.rmq.cloudamqp.com/lkscpwqu'
SCRAPE_NEWS_TASK_QUEUE_NAME = 'feeder-scrape-news-task-queue'

NEWS_SOURCES = [
    'bbc-news',
    'bbc-sport',
    'bloomberg',
    'cnn',
    'entertainment-weekly',
    'espn',
    'ign',
    'techcrunch',
    'the-new-york-times',
    'the-wall-street-journal',
    'the-washington-post',
    'the-verge'
]

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                   SCRAPE_NEWS_TASK_QUEUE_NAME)

while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)

    num_of_new_news = 0
    for news in news_list:
        # news_digest = base64.b64encode(hashlib.md5(news['title'].encode('utf-8')).digest())
        # news_digest = hashlib.md5(news['title'].encode('utf-8')).digest().encode('base64')
        news_digest = hashlib.md5(news['title'].encode('utf-8')).hexdigest()
        if redis_client.get(news_digest) is None:
            num_of_new_news = num_of_new_news + 1
            news['digest'] = news_digest  # every news item gets a unique digest
            # Use UTC time to avoid time-zone mismatches
            if news['publishedAt'] is None:
REDIS_HOST = config['redis']['HOST']
REDIS_PORT = config['redis']['PORT']

# Number of news items on a single page
NEWS_LIST_BATCH_SIZE = config['backend_server']['NEWS_LIST_BATCH_SIZE']
# Maximum number of news items in one fetch from MongoDB
NEWS_LIMIT = config['backend_server']['NEWS_LIMIT']
# Timeout for a user's pagination info in Redis
USER_NEWS_TIME_OUT_IN_SECONDS = config['backend_server']['USER_NEWS_TIME_OUT_IN_SECONDS']

CLICK_LOG_TASK_QUEUE_URL = config['cloudAMQP']['CLICK_LOG_TASK_QUEUE_URL']
CLICK_LOG_TASK_QUEUE_NAME = config['cloudAMQP']['CLICK_LOG_TASK_QUEUE_NAME']

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT, db=0)
cloudAMQP_client = CloudAMQPClient(CLICK_LOG_TASK_QUEUE_URL,
                                   CLICK_LOG_TASK_QUEUE_NAME)


def getOneNews():
    db = mongodb_client.get_db()
    news = db[NEWS_TABLE_NAME].find_one()
    return news


def getNewsSummariesForUser(user_id, page_num):
    page_num = int(page_num)
    # News index range to fetch for this page number
    begin_index = (page_num - 1) * NEWS_LIST_BATCH_SIZE
    end_index = page_num * NEWS_LIST_BATCH_SIZE
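    # getNewsSummariesForUser is cut off here. In this family of projects the
    # function typically continues along the following lines: serve page
    # slices from a pickled digest list cached per user in Redis, falling
    # back to MongoDB on a cache miss. This is a hedged sketch, not the
    # file's actual tail; it assumes pickle and mongodb_client are imported
    # above and uses only the constants defined in this snippet.
    sliced_news = []
    if redis_client.get(user_id) is not None:
        # Cache hit: slice the cached digest list and look the items up.
        news_digests = pickle.loads(redis_client.get(user_id))
        sliced_news_digests = news_digests[begin_index:end_index]
        db = mongodb_client.get_db()
        sliced_news = list(db[NEWS_TABLE_NAME].find(
            {'digest': {'$in': sliced_news_digests}}))
    else:
        # Cache miss: fetch the newest items and cache their digests.
        db = mongodb_client.get_db()
        total_news = list(db[NEWS_TABLE_NAME]
                          .find()
                          .sort([('publishedAt', -1)])
                          .limit(NEWS_LIMIT))
        total_news_digests = [x['digest'] for x in total_news]
        redis_client.set(user_id, pickle.dumps(total_news_digests))
        redis_client.expire(user_id, USER_NEWS_TIME_OUT_IN_SECONDS)
        sliced_news = total_news[begin_index:end_index]
    # Real code would serialize BSON (e.g., bson.json_util) before returning.
    return sliced_news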
import os
import sys

from newspaper import Article

sys.path.append(os.path.join(os.path.dirname(__file__),
                             '../backend_server/', 'utils'))
from cloudAMQP_client import CloudAMQPClient

DEDUPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@emu.rmq.cloudamqp.com/thclcviw"
DEDUPE_NEWS_TASK_QUEUE_NAME = "top-new-DEDUPE_NEWS_TASK_QUEUE_NAME"
SCRAPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@emu.rmq.cloudamqp.com/qfgxyvvk"
SCRAPE_NEWS_TASK_QUEUE_NAME = "top-news-SCRAPE_NEWS_TASK_QUEUE"

SLEEP_TIME_IN_SECONDS = 5

scrape_news_queue_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                           SCRAPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print('message is broken')
        return
    task = msg
    # Download and parse the article with newspaper
    article = Article(task['url'])
    article.download()
    article.parse()
    task['text'] = article.text

    dedupe_news_queue_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                               DEDUPE_NEWS_TASK_QUEUE_NAME)
NEWS_LIST_BATCH_SIZE = 10
USER_NEWS_TIME_OUT_IN_SECONDS = 60
LOG_CLICKS_TASK_QUEUE_URL = "amqp://*****:*****@crane.rmq.cloudamqp.com/crxnfwlj"
LOG_CLICKS_TASK_QUEUE_NAME = "clickLog"
"""

with open('../configuration/backend_conf.yaml', 'r') as stream:
    try:
        config = yaml.load(stream)
    except yaml.YAMLError as exc:
        print(exc)

redis_client = redis.StrictRedis(config['operations']['REDIS_HOST'],
                                 int(config['operations']['REDIS_PORT']),
                                 db=0)
cloudAMQP_client = CloudAMQPClient(config['operations']['LOG_CLICKS_TASK_QUEUE_URL'],
                                   config['operations']['LOG_CLICKS_TASK_QUEUE_NAME'])


def getNewsSummariesForUser(user_id, page_num):
    page_num = int(page_num)
    begin_index = (page_num - 1) * int(config['operations']['NEWS_LIST_BATCH_SIZE'])
    end_index = page_num * int(config['operations']['NEWS_LIST_BATCH_SIZE'])

    # The final list of news to be returned.
    sliced_news = []
    if redis_client.get(user_id) is not None:
        news_digests = pickle.loads(redis_client.get(user_id))
        # If begin_index is out of range, this returns an empty list;
        # if end_index is out of range (and begin_index is within range),
        # this returns all remaining news ids.
from newspaper import Article

sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
from cloudAMQP_client import CloudAMQPClient

with open('../config.json') as json_data_file:
    config = json.load(json_data_file)

SCRAPE_TASK_QUEUE_URL = config['cloudAMQP']['scraperTaskQueue']['url']
SCRAPE_TASK_QUEUE_NAME = config['cloudAMQP']['scraperTaskQueue']['name']
DEDUPE_TASK_QUEUE_URL = config['cloudAMQP']['deduperTaskQueue']['url']
DEDUPE_TASK_QUEUE_NAME = config['cloudAMQP']['deduperTaskQueue']['name']
SLEEP_IN_SECONDS = config['cloudAMQP']['scraperTaskQueue']['sleep']

scrape_task_mq_client = CloudAMQPClient(SCRAPE_TASK_QUEUE_URL, SCRAPE_TASK_QUEUE_NAME)
dedupe_task_mq_client = CloudAMQPClient(DEDUPE_TASK_QUEUE_URL, DEDUPE_TASK_QUEUE_NAME)


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print('message is invalid')
        return
    task = msg
    # Download and parse the article with newspaper
    article = Article(task['url'])
    article.download()
    article.parse()

    task['text'] = article.text
    dedupe_task_mq_client.sendMessage(task)
from cloudAMQP_client import CloudAMQPClient

DEDUPE_NEWS_TASK_QUEUE_URL = config['news_pipeline']['news_deduper'][
    'DEDUPE_NEWS_TASK_QUEUE_URL']
DEDUPE_NEWS_TASK_QUEUE_NAME = config['news_pipeline']['news_deduper'][
    'DEDUPE_NEWS_TASK_QUEUE_NAME']
SLEEP_TIME_IN_SECONDS = config['news_pipeline']['news_deduper'][
    'SLEEP_TIME_IN_SECONDS']
SAME_NEWS_SIMILARITY_THRESHOLD = config['news_pipeline']['news_deduper'][
    'SAME_NEWS_SIMILARITY_THRESHOLD']
NEWS_TABLE_NAME = config['news_pipeline']['news_deduper']['NEWS_TABLE_NAME']

cloudAMQP_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                   DEDUPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task['text']
    if not text:
        return

    # Get all recent news based on publishedAt
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
# Import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
import mongodb_client
import news_topic_modeling_service_client
from cloudAMQP_client import CloudAMQPClient

with open('../configuration/news_pipeline_conf.yaml', 'r') as stream:
    try:
        config = yaml.load(stream)
    except yaml.YAMLError as exc:
        print(exc)

cloudAMQP_client = CloudAMQPClient(
    config['news_deduper']['DEDUPE_NEWS_TASK_QUEUE_URL'],
    config['news_deduper']['DEDUPE_NEWS_TASK_QUEUE_NAME'])


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task['text']
    if text is None:
        return

    # Get news from the database with a similar publish time
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
from cloudAMQP_client import CloudAMQPClient

REDIS_HOST = 'localhost'
REDIS_PORT = 6379

NEWS_TABLE_NAME = '[top-news]'  # table name in MongoDB
PAGE_SIZE = 10
NEWS_LIMIT = 100
USER_NEWS_TIME_OUT_IN_SECONDS = 60  # timeout for a user's pagination info in Redis

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT, db=0)

USER_CLICK_LOG_QUEUE_URL = 'amqp://*****:*****@otter.rmq.cloudamqp.com/xhzhqriu'
USER_CLICK_LOG_QUEUE_NAME = 'UserClickLogQueue'
cloudAMQP_client = CloudAMQPClient(USER_CLICK_LOG_QUEUE_URL,
                                   USER_CLICK_LOG_QUEUE_NAME)


def getOneNews():
    db = mongodb_client.get_db()
    news = db[NEWS_TABLE_NAME].find_one()
    # bson.json_util.dumps serializes BSON into a JSON string;
    # deserialize that string and return a plain JSON object
    return json.loads(dumps(news))


def getNewsCount():
    db = mongodb_client.get_db()
    count = db[NEWS_TABLE_NAME].count()
    return count
import cnn_news_scraper  # pylint: disable=import-error, wrong-import-position

SCRAPER_QUEUE_URL = ""
SCRAPER_QUEUE_NAME = "news-scraper-queue"
DEDUPER_QUEUE_URL = ""
DEDUPER_QUEUE_NAME = "news-deduper-queue"
SLEEP_TIME_IN_SECONDS = 5

logger_format = '%(asctime)s - %(message)s'
logging.basicConfig(format=logger_format)
logger = logging.getLogger('news_fetcher')
logger.setLevel(logging.DEBUG)

scrape_queue_client = CloudAMQPClient(SCRAPER_QUEUE_URL, SCRAPER_QUEUE_NAME)
dedupe_queue_client = CloudAMQPClient(DEDUPER_QUEUE_URL, DEDUPER_QUEUE_NAME)


def handle_message_old(msg):
    # If the msg is not in JSON (dict) format
    if not isinstance(msg, dict):
        logger.warning('message is broken')
        return
    text = None
    if msg['source'] == 'cnn':
        text = cnn_news_scraper.extract_news(msg['url'])
    if text is not None and len(text) > 0:
        msg['text'] = text
        dedupe_queue_client.sendMessage(msg)
DEDUPER_QUEUE_URL = ""
DEDUPER_QUEUE_NAME = "news-deduper-queue"
SLEEP_TIME_IN_SECONDS = 1
NEWS_TABLE_NAME = "news_list"
SAME_NEWS_SIMILARITY_THRESHOLD = 0.82

logger_format = '%(asctime)s - %(message)s'
logging.basicConfig(format=logger_format)
logger = logging.getLogger('news_deduper')
logger.setLevel(logging.DEBUG)

cloudAMQP_client = CloudAMQPClient(DEDUPER_QUEUE_URL, DEDUPER_QUEUE_NAME)

stemmer = PorterStemmer()
# Translation table that strips punctuation (Python 3 str.maketrans)
translator = str.maketrans('', '', string.punctuation)


def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed


def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems
stream = open("../config.yml", "r")
load = yaml.load(stream)
config = load['default']['common']

DEDUPE_NEWS_TASK_QUEUE_URL = config['cloudAMQP']['DEDUPE_NEWS_TASK_QUEUE_URL']
DEDUPE_NEWS_TASK_QUEUE_NAME = config['cloudAMQP']['DEDUPE_NEWS_TASK_QUEUE_NAME']
SCRAPE_NEWS_TASK_QUEUE_URL = config['cloudAMQP']['SCRAPE_NEWS_TASK_QUEUE_URL']
SCRAPE_NEWS_TASK_QUEUE_NAME = config['cloudAMQP']['SCRAPE_NEWS_TASK_QUEUE_NAME']
SLEEP_IN_SECOND = config['cloudAMQP']['FETCHER_SLEEP_IN_SECOND']

dedupe_news_queue_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                           DEDUPE_NEWS_TASK_QUEUE_NAME)
scrape_news_queue_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                           SCRAPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print('msg is broken')
        return
    task = msg
    # text = None
    # We support CNN only for now
    # if task['source'] == 'cnn':
    #     print('Scraping CNN news')
    #     text = scraper.cnn_news_scraper.extract_news(task['url'])
from newspaper import Article

sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
sys.path.append(os.path.join(os.path.dirname(__file__), 'scrapers'))

import cnnNewsScraper
from cloudAMQP_client import CloudAMQPClient

SCRAPE_NEWS_TASK_CLOUDAMQP_URL = "amqp://*****:*****@wasp.rmq.cloudamqp.com/fauunoyn"
SCRAPE_NEWS_TASK_CLOUDAMQP_NAME = "news-scrape-task-queue"
DEDUPE_NEWS_TASK_CLOUDAMQP_URL = "amqp://*****:*****@wasp.rmq.cloudamqp.com/fauunoyn"
DEDUPE_NEWS_TASK_CLOUDAMQP_NAME = "news-dedupe-task-queue"

SLEEP_TIME_IN_SECONDS = 5

dedupe_queue_client = CloudAMQPClient(DEDUPE_NEWS_TASK_CLOUDAMQP_URL,
                                      DEDUPE_NEWS_TASK_CLOUDAMQP_NAME)
scrape_queue_client = CloudAMQPClient(SCRAPE_NEWS_TASK_CLOUDAMQP_URL,
                                      SCRAPE_NEWS_TASK_CLOUDAMQP_NAME)


def handle_message(msg):
    global dedupe_queue_client
    if msg is None or not isinstance(msg, dict):
        print('message is broken')
        return
    task = msg
    article = Article(task['url'])
    article.download()
from cloudAMQP_client import CloudAMQPClient

CLOUDAMQP_URL = 'amqp://*****:*****@hyena.rmq.cloudamqp.com/jnqrsjwd'
QUEUE_NAME = 'dataFetcherTaskQueue'

# Initialize a client
client = CloudAMQPClient(CLOUDAMQP_URL, QUEUE_NAME)

# Send a message
client.sendDataFetcherTask({'name': 'test message'})

# Receive a message
# client.getDataFetcherTask()
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
import news_api_client
from cloudAMQP_client import CloudAMQPClient

with open('../configuration/news_pipeline_conf.yaml', 'r') as stream:
    try:
        config = yaml.load(stream)
    except yaml.YAMLError as exc:
        print(exc)

redis_client = redis.StrictRedis(config['news_monitor']['REDIS_HOST'],
                                 int(config['news_monitor']['REDIS_PORT']))
cloudAMQP_client = CloudAMQPClient(
    config['news_monitor']['SCRAPE_NEWS_TASK_QUEUE_URL'],
    config['news_monitor']['SCRAPE_NEWS_TASK_QUEUE_NAME'])

while True:
    news_list = news_api_client.getNewsFromSource(
        config['news_monitor']['NEWS_SOURCES'])

    num_of_new_news = 0
    for news in news_list:
        # The digest can be used as a unique ID
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')
        if redis_client.get(news_digest) is None:
            # A new news item has come in
NEWS_TIME_OUT_IN_SECONDS = 3600 * 24 * 3
REDIS_HOST = 'localhost'
REDIS_PORT = 6379

SCRAPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@otter.rmq.cloudamqp.com/cwvictdm"
SCRAPE_NEWS_TASK_QUEUE_NAME = "tap-news-scrape-news-task-queue"

NEWS_SOURCES = [
    'bbc-news',
    'bbc-sport',
    'bloomberg',
    'cnn',
    'entertainment-weekly',
    'espn',
    'ign',
    'techcrunch',
    'the-new-york-times',
    'the-wall-street-journal',
    'the-washington-post'
]

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                   SCRAPE_NEWS_TASK_QUEUE_NAME)


def run():
    while True:
        news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)
        num_of_new_news = 0
        for news in news_list:
            news_digest = hashlib.md5(
                news['title'].encode('utf-8')).hexdigest()
            if redis_client.get(news_digest) is None:
                num_of_new_news = num_of_new_news + 1
                news['digest'] = news_digest
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
import mongodb_client
from cloudAMQP_client import CloudAMQPClient

# Use your own CloudAMQP queue
DEDUPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@donkey.rmq.cloudamqp.com/ztdiqmcn"
DEDUPE_NEWS_TASK_QUEUE_NAME = "tap-news-dedupe-news-task-queue"

SLEEP_TIME_IN_SECONDS = 1
NEWS_TABLE_NAME = "news"
SAME_NEWS_SIMILARITY_THRESHOLD = 0.8

cloudAMQP_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                   DEDUPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task['text']
    if not text:
        return

    # Get all recent news published on the same day
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)
    db = mongodb_client.get_db()
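# The deduper snippets above are all cut off before the similarity check that
# SAME_NEWS_SIMILARITY_THRESHOLD implies. Below is a hedged sketch of the
# usual TF-IDF approach, assuming scikit-learn is available; is_duplicate and
# recent_texts are illustrative names, not identifiers from the original files.
from sklearn.feature_extraction.text import TfidfVectorizer


def is_duplicate(text, recent_texts, threshold):
    """Return True if text is near-identical to any recently stored text."""
    if not recent_texts:
        return False
    documents = recent_texts + [text]
    # TfidfVectorizer L2-normalizes rows by default, so tfidf * tfidf.T is
    # a matrix of pairwise cosine similarities.
    tfidf = TfidfVectorizer().fit_transform(documents)
    pairwise_sim = tfidf * tfidf.T
    rows, _ = pairwise_sim.shape
    # Compare the new text (last row) against every stored document.
    for col in range(rows - 1):
        if pairwise_sim[rows - 1, col] > threshold:
            return True
    return False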
from cloudAMQP_client import CloudAMQPClient

# Don't modify this value unless you know what you are doing.
NUM_OF_CLASSES = 17
INITIAL_P = 1.0 / NUM_OF_CLASSES
ALPHA = 0.1

SLEEP_TIME_IN_SECONDS = 1

LOG_CLICKS_TASK_QUEUE_URL = 'amqp://*****:*****@donkey.rmq.cloudamqp.com/zylpcqxg'
LOG_CLICKS_TASK_QUEUE_NAME = 'tap-news-log-clicks-task-queue'

PREFERENCE_MODEL_TABLE_NAME = 'user_preference_model'
NEWS_TABLE_NAME = 'news-test'

cloudAMQP_client = CloudAMQPClient(LOG_CLICKS_TASK_QUEUE_URL,
                                   LOG_CLICKS_TASK_QUEUE_NAME)


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    if ('userId' not in msg
            or 'newsId' not in msg
            or 'timestamp' not in msg):
        return

    userId = msg['userId']
    newsId = msg['newsId']

    # Update the user's preference model
    db = mongodb_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': userId})
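# handle_message stops right after loading the user's model. A hedged sketch
# of the time-decay update this kind of processor usually applies follows; it
# assumes each news document carries a topic 'class' index produced by the
# topic-modeling service, and _update_preference is an illustrative helper
# name, not part of the original file.
def _update_preference(model, clicked_class):
    # Decay every class probability toward 0, then add the learning-rate
    # mass ALPHA to the clicked class; the probabilities still sum to 1.
    updated = []
    for class_index, p in enumerate(model['preference']):
        target = 1.0 if class_index == clicked_class else 0.0
        updated.append((1 - ALPHA) * p + ALPHA * target)
    model['preference'] = updated
    return model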
# 1st AMQP queue: stores the news digest and the URL to scrape the news body from
SCRAPE_NEWS_TASK_QUEUE_URL = 'amqp://*****:*****@otter.rmq.cloudamqp.com/xhzhqriu'
SCRAPE_NEWS_TASK_QUEUE_NAME = 'TopNewsTitleQueue'
# 2nd AMQP queue: stores the news body for the news deduper to consume
DEDUPE_NEWS_TASK_QUEUE_URL = 'amqp://*****:*****@otter.rmq.cloudamqp.com/xhzhqriu'
DEDUPE_NEWS_TASK_QUEUE_NAME = 'TopNewsQueue'

# Interval between two processes
SLEEP_TIME_IN_SECONDS = 5
# Put both queues to sleep in turn a certain number of times; total pause
# time (when no msg is in the queue) is 2 * 45 * 10 = 900 seconds
PAUSE_INTERVAL_IN_EACH_LOOP = 45
LOOPS = 10

dedupe_news_queue_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                           DEDUPE_NEWS_TASK_QUEUE_NAME)
scrape_news_queue_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                           SCRAPE_NEWS_TASK_QUEUE_NAME)


# Process a message: attach a text field (the news body), then push the task
# to the next queue
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print('message is broken')
        return
    task = msg
    article = Article(task['url'])
    article.download()
    article.parse()
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
import news_api_client
from cloudAMQP_client import CloudAMQPClient

NEWS_TIME_OUT_IN_SECONDS = 3600 * 24 * 3
SLEEP_TIME_IN_SECONDS = 10
REDIS_HOST = 'localhost'
REDIS_PORT = 6379

SCRAPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@crane.rmq.cloudamqp.com/mcyrgohw"
SCRAPE_NEWS_TASK_QUEUE_NAME = "tap-news-scrape-news-task-queue"

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                   SCRAPE_NEWS_TASK_QUEUE_NAME)

NEWS_SOURCES = [
    'bbc-news',
    'bbc-sport',
    'bloomberg',
    'cnn',
    'entertainment-weekly',
    'espn',
    'ign',
    'techcrunch',
    'the-new-york-times',
    'the-wall-street-journal',
    'the-washington-post'
]

while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)

    num_of_new_news = 0
    for news in news_list:
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), 'common'))

import time

from cloudAMQP_client import CloudAMQPClient  # pylint: disable=E0401
from dotenv import load_dotenv  # pylint: disable=E0401

dotenv_path = os.path.join(os.path.dirname(__file__), '..', '.env')
load_dotenv(dotenv_path)

MQ_CAMP_FETCHER_TASK_NAME = os.environ.get("MQ_CAMP_FETCHER_TASK_NAME")
MQ_CAMP_FETCHER_TASK_URI = os.environ.get("MQ_CAMP_FETCHER_TASK_URI")

fetch_camp_client = CloudAMQPClient(MQ_CAMP_FETCHER_TASK_URI,
                                    MQ_CAMP_FETCHER_TASK_NAME)

SLEEP_TIME_IN_SECONDS = 1
ROOT_URL = "http://find.acacamps.org/camp_profile.php?camp_id="


def build_camp_url(index):
    camp_url = '%s%s' % (ROOT_URL, str(index))
    return camp_url


def run():
    '''Loop over the camp index range and enqueue each individual camp's
    profile URL as a task in the message queue.'''
    for index in range(1, 4869):
        camp_url = build_camp_url(index)
        task = {'url': camp_url, 'camp_id': index}
# Prepare logging
import logging
logging.basicConfig(filename='./logging/backend_server.log', level=logging.INFO)

# Ask for server configs
server_config = config_service_client.getServerConfigForServer('backend_server')
server_host = server_config['url']
server_port = int(server_config['port'])

# Ask for mq configs
mq_config = config_service_client.getMessagequeueConfigForUsecase('log_clicks_task')
cloudAMQP_client = CloudAMQPClient(mq_config['queue_url'], mq_config['queue_name'])


class RequestHandler(pyjsonrpc.HttpRequestHandler):

    @pyjsonrpc.rpcmethod
    def getNewsSummariesForUser(self, user_id, page_num):
        logging.info('backend_server: %s asks for page %s' % (user_id, page_num))
        return operations.getNewsSummariesForUser(user_id, page_num)

    @pyjsonrpc.rpcmethod
    def logNewsClickForUser(self, user_id, news_id, rate):
        logging.info('backend_server: %s rates %s for news: %s' %
                     (user_id, rate, news_id))
        return operations.logNewsClickForUser(user_id, news_id, rate)
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
import mongodb_client
import zillow_web_scraper_client
from cloudAMQP_client import CloudAMQPClient

# RabbitMQ config
CLOUD_AMQP_URL = 'amqp://*****:*****@hyena.rmq.cloudamqp.com/kdflangt'
DATA_FETCHER_QUEUE_NAME = 'dataFetcherTaskQueue'

ZIPCODE_FILE = 'san_diego_zipcode_list.txt'
SHUFFLE_ZIPCODES = True
WAITING_TIME = 3

cloudAMQP_client = CloudAMQPClient(CLOUD_AMQP_URL, DATA_FETCHER_QUEUE_NAME)

zipcode_list = []
with open(ZIPCODE_FILE, 'r') as zipcode_file:
    for zipcode in zipcode_file:
        zipcode_list.append(str(zipcode))

if SHUFFLE_ZIPCODES:
    print('shuffle zipcodes!')
    random.shuffle(zipcode_list)

for zipcode in zipcode_list:
    zpids = zillow_web_scraper_client.get_zpid_by_zipcode(zipcode)
    time.sleep(WAITING_TIME)
import os
import sys

from newspaper import Article

sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
import config_service_client
from cloudAMQP_client import CloudAMQPClient

# Prepare logging
import logging
logging.basicConfig(filename='./logging/news_pipeline.log', level=logging.INFO)

# Ask for mq configs
scrape_mq_config = config_service_client.getMessagequeueConfigForUsecase('scrape_news_task')
dedupe_mq_config = config_service_client.getMessagequeueConfigForUsecase('dedupe_news_task')
scrape_news_queue_client = CloudAMQPClient(scrape_mq_config['queue_url'],
                                           scrape_mq_config['queue_name'])
dedupe_news_queue_client = CloudAMQPClient(dedupe_mq_config['queue_url'],
                                           dedupe_mq_config['queue_name'])

# Ask for other configs
fetch_config = config_service_client.getPipelineConfigForSection('news_fetcher')
scrape_sleeptime_seconds = int(fetch_config['scrape_queue_client_sleeptime_seconds'])
dedupe_sleeptime_seconds = int(fetch_config['dedupe_queue_client_sleeptime_seconds'])


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print('message from news_to_scrape is broken')
        logging.error('news_fetcher: message from news_to_scrape is broken')
        return
    # Use newspaper to scrape the text of the news
    task = msg
    text = None
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
import mongodb_client
from cloudAMQP_client import CloudAMQPClient

DEDUPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@otter.rmq.cloudamqp.com/ncbqvnsd"
DEDUPE_NEWS_TASK_QUEUE_NAME = "pin_news_deduper"

SLEEP_TIME_IN_SECONDS = 1
SAME_NEWS_SIMILARITY_THRESHOLD = 0.9
NEWS_TABLE_NAME = "pin_news"

dedupe_news_queue_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                           DEDUPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    content = task['content']
    if content is None:
        return

    # Get all recent news based on publishedAt, starting from the previous
    # day (subtract a timedelta so the first day of a month doesn't underflow)
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = (datetime.datetime(published_at.year,
                                                published_at.month,
                                                published_at.day, 0, 0, 0, 0)
                              - datetime.timedelta(days=1))
NEWS_SOURCES = [
    'bbc-news',
    'bbc-sport',
    'bloomberg',
    'cnn',
    'entertainment-weekly',
    'espn',
    'ign',
    'techcrunch',
    'the-new-york-times',
    'the-wall-street-journal',
    'the-washington-post'
]

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                   SCRAPE_NEWS_TASK_QUEUE_NAME)

while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)

    num_of_new_news = 0
    for news in news_list:
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')
        if redis_client.get(news_digest) is None:
            num_of_new_news = num_of_new_news + 1
            news['digest'] = news_digest
            if news['publishedAt'] is None:
                # Format: YYYY-MM-DDTHH:MM:SS in UTC
                news['publishedAt'] = datetime.datetime.utcnow().strftime(
                    '%Y-%m-%dT%H:%M:%SZ')
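            # The monitor loops above are all cut off right after the digest
            # bookkeeping. A hedged sketch of how such a loop typically
            # finishes: remember the digest in Redis with an expiry, publish
            # the task, then pause. It assumes the NEWS_TIME_OUT_IN_SECONDS
            # and SLEEP_TIME_IN_SECONDS constants defined earlier in these
            # files, not anything shown in this excerpt.
            redis_client.set(news_digest, news['title'])
            redis_client.expire(news_digest, NEWS_TIME_OUT_IN_SECONDS)
            cloudAMQP_client.sendMessage(news)

    print('%d new news items published to the queue' % num_of_new_news)
    cloudAMQP_client.sleep(SLEEP_TIME_IN_SECONDS)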
import os
import sys

import redis
import hashlib
import datetime

sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
import news_api_client
import config_service_client
from cloudAMQP_client import CloudAMQPClient

# Prepare logging
import logging
logging.basicConfig(filename='./logging/news_pipeline.log', level=logging.INFO)

# Ask for mq and memory configs
mq_config = config_service_client.getMessagequeueConfigForUsecase('scrape_news_task')
cloudAMQP_client = CloudAMQPClient(mq_config['queue_url'], mq_config['queue_name'])
mmr_config = config_service_client.getMemoryConfig('redis')
redis_client = redis.StrictRedis(mmr_config['host'], mmr_config['port'])

# Ask for other params
news_monitor_config = config_service_client.getPipelineConfigForSection('news_monitor')
news_sources = news_monitor_config['news_sources']
news_timeout_seconds = int(news_monitor_config['news_timeout_seconds'])
sleeptime_seconds = int(news_monitor_config['scrape_queue_client_sleeptime_seconds'])

while True:
    # Each iteration fetches the latest news list, but most items may be
    # old duplicates.
    news_list = news_api_client.getNewsFromSource(news_sources)
    num_of_new_news = 0