def clearQueue(queue_url, queue_name):
    """Drain every pending message from a queue and report how many were removed.

    Messages are fetched one at a time with ``getMessage()`` and discarded;
    draining stops at the first ``None`` result, after which the total is
    printed.

    Args:
        queue_url: Connection URL passed to ``CloudAMQPClient``.
        queue_name: Name of the queue to empty.
    """
    scrape_news_queue_client = CloudAMQPClient(queue_url, queue_name)

    num_of_messages = 0

    # The client was just constructed, so it is not re-checked against None on
    # every iteration (the previous in-loop check would have spun forever had
    # it ever failed).
    while scrape_news_queue_client.getMessage() is not None:
        num_of_messages += 1

    print("Cleared %d messages." % num_of_messages)
def test_basic():
    """Round-trip one message through the test queue and check it is unchanged."""
    queue = CloudAMQPClient(TEST_CLOUDAMQP_URL, TEST_QUEUE_NAME)

    payload = {'test': 'test'}
    queue.sendMessage(payload)

    # Pause between publishing and polling (presumably to let the broker
    # deliver the message).
    queue.sleep(5)

    echoed = queue.getMessage()
    assert payload == echoed

    print('test_basic passed!')
Ejemplo n.º 3
0
def clearQueue(queue_url, queue_name):
    """Drain every pending message from a queue and report how many were removed.

    Messages are fetched one at a time with ``getMessage()`` and discarded;
    draining stops at the first ``None`` result, after which the total and the
    queue name are printed.

    Args:
        queue_url: Connection URL passed to ``CloudAMQPClient``.
        queue_name: Name of the queue to empty (also used in the report).
    """
    queue_client = CloudAMQPClient(queue_url, queue_name)

    num_of_messages = 0

    # The client was just constructed, so it is not re-checked against None on
    # every iteration (the previous in-loop check would have spun forever had
    # it ever failed).
    while queue_client.getMessage() is not None:
        num_of_messages += 1

    print("Cleared %d messages in queue %s." %
          (num_of_messages, queue_name))
Ejemplo n.º 4
0
def test_basic():
    """Round-trip one message through the test queue and check it is unchanged."""
    queue = CloudAMQPClient(TEST_CLOUDAMQP_URL, TEST_QUEUE_NAME)

    outgoing = {'test': 'test'}
    queue.send_message(outgoing)

    # Pause between publishing and polling (presumably to let the broker
    # deliver the message).
    queue.sleep(5)

    incoming = queue.get_message()
    assert outgoing == incoming

    print('test_basic passed!')
Ejemplo n.º 5
0
def logNewsClickForUser(user_id, news_id):
    """Store a news click in MongoDB and publish it to the click-log queue.

    The same click is written twice: once as a document in the
    ``CLICK_LOGS_TABLE_NAME`` collection (timestamp kept as a ``datetime``)
    and once as a queue message (timestamp converted with ``str``).

    Args:
        user_id: Identifier of the user who clicked.
        news_id: Identifier of the clicked news item.
    """
    # NOTE(review): the queue URL embeds credentials in source; consider
    # moving it to configuration.
    LOG_CLICKS_TASK_QUEUE_URL = "amqp://*****:*****@emu.rmq.cloudamqp.com/evvloemh"
    LOG_CLICKS_TASK_QUEUE_NAME = "LOG_CLICKS_TASK_QUEUE"
    cloudAMQP_client = CloudAMQPClient(LOG_CLICKS_TASK_QUEUE_URL, LOG_CLICKS_TASK_QUEUE_NAME)

    # Read the clock once so the stored log and the queued task carry the
    # same instant (previously each got its own utcnow() call).
    clicked_at = datetime.utcnow()

    db = mongodb_client.get_db()
    db[CLICK_LOGS_TABLE_NAME].insert(
        {'userId': user_id, 'newsId': news_id, 'timestamp': clicked_at})

    # Send log task to machine learning service for prediction.
    # A fresh dict is built for the queue, with the timestamp as a string.
    cloudAMQP_client.sendMessage(
        {'userId': user_id, 'newsId': news_id, 'timestamp': str(clicked_at)})
Ejemplo n.º 6
0
def test_logNewsClickForUser():
    """Log a click through ``operations`` and check the message that reaches the queue."""
    queue = CloudAMQPClient(LOG_CLICKS_TASK_QUEUE_URL,
                            LOG_CLICKS_TASK_QUEUE_NAME)

    operations.log_news_click_for_user('test_user', 'test_news')

    # Pause before polling (presumably to let the broker deliver the message).
    queue.sleep(3)
    click = queue.getMessage()

    assert click['userId'] == 'test_user'
    assert click['newsId'] == 'test_news'

    print('test_logNewsClickForUser passed!')
Ejemplo n.º 7
0
def handle_message(msg):
    """Download the article behind ``msg['url']`` and forward the task with its text.

    Anything that is not a dict (including ``None``) is reported as broken and
    ignored. Otherwise the article is downloaded and parsed, its text is
    stored under ``msg['text']`` (the dict is modified in place), and the dict
    is sent to the dedupe queue.
    """
    # isinstance() is already False for None, so one check covers both cases.
    if not isinstance(msg, dict):
        print('message is broken')
        return

    page = Article(msg['url'])
    page.download()
    page.parse()
    msg['text'] = page.text

    forwarder = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL, DEDUPE_NEWS_TASK_QUEUE_NAME)
    forwarder.sendMessage(msg)
Ejemplo n.º 8
0
def test_basic():
    """Read one message from the dedupe queue and compare it with the expected payload."""
    client = CloudAMQPClient(DEDUP_CLOUDAMQP_URL, DEDUP_QUEUE_NAME)

    sentMsg = {'test': 'test'}
    # NOTE(review): the send step below is disabled, so this test only passes
    # when an equal message is already waiting in the queue.
    # try:
    #     client.send_message(sentMsg)
    # except Exception as e:
    #     print("send message wrong")
    receivedMSG = client.receive_message()

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(receivedMSG)

    assert sentMsg == receivedMSG
    print("test_basic passed")
Ejemplo n.º 9
0
    def __call__(self):
        """Consume the scrape queue until it returns ``None``, then close both clients.

        Creates the dedupe and scrape queue clients, then repeatedly fetches a
        message from the scrape queue and passes it to ``self.handle_message``.
        After each handled message the scrape client sleeps for
        ``self.sleep_time_in_seconds``. The first ``None`` message ends the
        loop and closes both clients.
        """
        self.dedupe_news_queue_client = CloudAMQPClient(
            self.dedupe_news_task_queue_url, self.dedupe_news_task_queue_name)
        self.scrape_news_queue_client = CloudAMQPClient(
            self.scrape_news_task_queue_url, self.scrape_news_task_queue_name)

        if self.scrape_news_queue_client is None:
            return

        # fetch msg from queue
        while True:
            msg = self.scrape_news_queue_client.getMessage()
            if msg is None:
                self.scrape_news_queue_client.close()
                self.dedupe_news_queue_client.close()
                break

            # handle message; a failure on one message is reported and the
            # loop keeps consuming
            try:
                self.handle_message(msg)
            except Exception as e:
                # print(...) form works on both Python 2 and Python 3
                print(e)
            self.scrape_news_queue_client.sleep(
                self.sleep_time_in_seconds)
Ejemplo n.º 10
0
def logCoursesClick(userId, courseId):
    """Store a course click in MongoDB and publish it to the click task queue.

    The click is inserted into the ``clicklog`` collection (timestamp kept as
    a ``datetime``) and then sent as a queue message (timestamp converted with
    ``str``).

    Args:
        userId: Identifier of the user who clicked.
        courseId: Identifier of the clicked course.
    """
    # print(...) form works on both Python 2 and Python 3
    print("click received")

    # Read the clock once so the stored log and the queued task carry the
    # same instant (previously each got its own utcnow() call).
    clicked_at = datetime.utcnow()

    db = mongodb_client.get_db()
    db['clicklog'].insert({
        'userId': userId,
        'courseId': courseId,
        'timestamp': clicked_at,
    })

    cloudAMQP_client = CloudAMQPClient(CLICK_TASK_QUEUE_URL,
                                       CLICK_TASK_QUEUE_NAME)
    # NOTE(review): the queue message carries the course id under 'newsId',
    # unlike the stored document ('courseId') -- confirm the consumer expects
    # this key before changing it.
    cloudAMQP_client.sendMessage({
        'userId': userId,
        'newsId': courseId,
        'timestamp': str(clicked_at),
    })
Ejemplo n.º 11
0
    def __init__(self):
        """Load settings from ``../config/config.json`` and set up clients and counters.

        Reads the queue, MongoDB and click-model sections of the JSON file,
        creates the click-log queue client and the database handle, and
        starts a zero count for every entry of ``news_classes.classes``.
        """
        with open('../config/config.json', 'r') as config_file:
            settings = json.load(config_file)

        # Sections are read in the same order as before so a missing key
        # surfaces at the same point.
        queue_settings = settings['queue']
        queue_url = queue_settings['logClicksTaskQueueUrl']
        queue_name = queue_settings['logClicksTaskQueueName']
        self.sleep_time = int(queue_settings['logClicksTaskSleepTime'])

        mongo_settings = settings['mongoDb']
        self.news_db_collection = mongo_settings['newsMongoDbCollection']
        self.prefer_db_collection = mongo_settings['preferMongoDbCollection']

        model_settings = settings['clicksModel']
        self.alpha = float(model_settings['alpha'])
        self.daydelta = int(model_settings['daydelta'])

        self.logclick_client = CloudAMQPClient(queue_url, queue_name)
        self.db = mongodb_client.get_db()

        self.cnt_list = {}
        self.clicks_list = {}
        for news_class in news_classes.classes:
            self.cnt_list[news_class] = 0
Ejemplo n.º 12
0
 def __call__(self):
     """Consume the dedupe queue until it returns ``None``, counting accepted news.

     Creates the queue client and database handle, then repeatedly fetches a
     message and passes it to ``self.handle_messages``. A truthy result
     increments the unique-news counter; a falsy one is reported as invalid.
     After each message the client sleeps for ``self.sleep_time_in_seconds``.
     The first ``None`` message prints the total, closes the client and ends
     the loop.
     """
     self.cloudAMQP_client = CloudAMQPClient(self.dedupe_news_task_queue_url, self.dedupe_news_task_queue_name)
     self.db = mongodb_client.get_db()
     num_unique_news = 0

     # Checked once up front: the previous in-loop check would have spun
     # forever if the client were ever None.
     if self.cloudAMQP_client is None:
         return

     while True:
         msg = self.cloudAMQP_client.getMessage()
         if msg is None:
             print("Store %d unique news in mongoDb" % num_unique_news)
             self.cloudAMQP_client.close()
             break

         # Parse and process the task; a failure on one message is reported
         # and the loop keeps consuming.
         try:
             if self.handle_messages(msg):
                 num_unique_news += 1
             else:
                 print("invalid msg")
         except Exception as e:
             # print(...) form works on both Python 2 and Python 3
             print(e)
         self.cloudAMQP_client.sleep(self.sleep_time_in_seconds)
Ejemplo n.º 13
0
### REPLACE CLOUD_AMQP_URL WITH YOUR OWN ###
# Connection URL and queue name used to build the module-level queue client.
CLOUD_AMQP_URL = '''amqp://*****:*****@hyena.rmq.cloudamqp.com/htwgtamk'''
EMAIL_SENDER_QUEUE_NAME = 'emailSenderTaskQueue'

# mongodb config
PROPERTY_TABLE_NAME = 'property'
# Database name passed to mongodb_client.getDB() in getPermissionList().
# (The identifier is spelled "PERMISSIN"; left as-is because it is referenced
# by name elsewhere.)
USERS_PERMISSIN_DATABASE = 'real-estate-smart-view'
# Collection queried by getPermissionList() for users with e-mail permission.
USERS_EMAILSENDER_PERMISSION = 'users'
FETCH_SIMILAR_PROPERTIES = True

# Durations expressed in seconds.
SECONDS_IN_ONE_DAY = 3600 * 24
SECONDS_IN_ONE_WEEK = SECONDS_IN_ONE_DAY * 7

# NOTE(review): unit not shown here -- presumably seconds; confirm at the use site.
WAITING_TIME = 3

# Created at import time; importing this module constructs the queue client.
cloudAMQP_client = CloudAMQPClient(CLOUD_AMQP_URL, EMAIL_SENDER_QUEUE_NAME)


def getPermissionList():
    """Return the e-mail addresses of users whose ``emailSend_permission`` is True."""
    user_collection = mongodb_client.getDB(
        USERS_PERMISSIN_DATABASE)[USERS_EMAILSENDER_PERMISSION]
    permitted = user_collection.find({'emailSend_permission': True})
    return [record['email'] for record in permitted]


def handle_message(msg):
    task = json.loads(msg)
    #task = msg
Ejemplo n.º 14
0
NEWS_CONFIG_FILE = os.path.join(os.path.dirname(__file__), '..',
                                'config/news.yaml')

# Load configuration from the YAML files.
# safe_load is used because a bare yaml.load(stream) is deprecated since
# PyYAML 5.1 and requires an explicit Loader from 6.0 on; safe_load builds
# only plain Python types.
with open(DB_CONFIG_FILE, 'r') as dbCfg:
    db_config = yaml.safe_load(dbCfg)

with open(CLOUDAMQP_CONFIG_FILE, 'r') as amqpCfg:
    cloudAMQP_config = yaml.safe_load(amqpCfg)

with open(NEWS_CONFIG_FILE, 'r') as newsCfg:
    news_config = yaml.safe_load(newsCfg)

# Clients built from the loaded configuration (created at import time).
redis_client = redis.StrictRedis(db_config['redis']['host'],
                                 db_config['redis']['port'])
cloudAMQP_client = CloudAMQPClient(cloudAMQP_config['url'],
                                   cloudAMQP_config['scrape_queue_name'])

while True:
    news_list = news_api_client.getNewsFromSources(news_config['news_sources'])
    num_of_new_news = 0

    for news in news_list:
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')

        if redis_client.get(news_digest) is None:
            num_of_new_news = num_of_new_news + 1
            news['digest'] = news_digest

            # If 'publishedAt' is None, set it to current UTC time
            if news['publishedAt'] is None:
Ejemplo n.º 15
0
DEDUPE_NEWS_TASK_QUEUE_NAME = "test1"

SLEEP_TIME_IN_SECONDS = 1
"""
NEWS_TABLE_NAME = "news-test"

SAME_NEWS_SIMILARITY_THRESHOLD = 0.9

# Load the pipeline configuration.
# safe_load is used because a bare yaml.load(stream) is deprecated since
# PyYAML 5.1 and requires an explicit Loader from 6.0 on.
with open('../configuration/news_pipeline_conf.yaml', 'r') as stream:
    try:
        config = yaml.safe_load(stream)
    except yaml.YAMLError as error3:
        # print(...) form works on both Python 2 and Python 3.
        # NOTE(review): after a parse error execution continues and `config`
        # is not assigned here, so the client construction below will fail --
        # consider re-raising.
        print(error3)

# Queue client built from the loaded configuration (created at import time).
cloudAMQP_client = CloudAMQPClient(
    config['news_deduper']['DEDUPE_NEWS_TASK_QUEUE_URL'],
    config['news_deduper']['DEDUPE_NEWS_TASK_QUEUE_NAME'])


def handle_message(msg):
    #if msg is None or not isinstance(msg, dict):
    #return

    task = msg
    text = task['text']
    if text is None:
        #print 'how are you'
        return

    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
Ejemplo n.º 16
0
from dateutil import parser
from sklearn.feature_extraction.text import TfidfVectorizer

# import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
import ENV
import mongodb_client
import news_topic_modeling_service_client

from cloudAMQP_client import CloudAMQPClient

# Queue settings are taken from the ENV module.
DEDUPE_NEWS_TASK_QUEUE_URL = ENV.DEDUPE_NEWS_TASK_QUEUE_URL
DEDUPE_NEWS_TASK_QUEUE_NAME = ENV.DEDUPE_NEWS_TASK_QUEUE_NAME

# Created at import time.
dedupe_news_queue_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                           DEDUPE_NEWS_TASK_QUEUE_NAME)
SLEEP_TIME_IN_SECONDS = 1
NEWS_TABLE_NAME = ENV.NEWS_TABLE_NAME

SAME_NEWS_SIMILARITY_THRESHOLD = 0.9

# NOTE(review): second client constructed with the same URL and queue name as
# dedupe_news_queue_client above -- confirm both are needed.
cloudAMQP_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                   DEDUPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task['text']
    if text is None:
Ejemplo n.º 17
0
from cloudAMQP_client import CloudAMQPClient

# REPLACE URL WITH YOUR OWN
# Script: builds a client for the data-fetcher queue and publishes one task.
CLOUDAMQP_URL = 'amqp://*****:*****@rhino.rmq.cloudamqp.com/aznxmzpt'
QUEUE_NAME = 'dataFetcherTaskQueue'

# Initialize a client
client = CloudAMQPClient(CLOUDAMQP_URL, QUEUE_NAME)

# Send a message (a single task dict with a hard-coded 'zpid' value)
client.sendDataFetcherTask({'zpid' : '83154148'})


# Receive a message (disabled)
#client.getDataFetcherTask()
import mongodb_client # pylint: disable=import-error, wrong-import-position
from cloudAMQP_client import CloudAMQPClient
import news_recommendation_service_client

# MongoDB collection read by get_one_news().
NEWS_TABLE_NAME = 'news'

# Redis connection settings used by get_news_summaries_for_user().
REDIS_HOST = 'localhost'
REDIS_PORT = 6379

NEWS_LIMIT = 200
# Page size: get_news_summaries_for_user() slices in steps of this many items.
NEWS_LIST_SIZE = 10
# 24 hours expressed in seconds.
USER_NEWS_TIME_OUT = 3600 * 24

# Click-logger queue; the client is created at import time.
CLICK_LOGGER_QUEUE_URL = 'amqp://*****:*****@skunk.rmq.cloudamqp.com/wznspxdt'
CLICK_LOGGER_QUEUE_NAME = 'Click_Logger'
click_logger_client = CloudAMQPClient(CLICK_LOGGER_QUEUE_URL, CLICK_LOGGER_QUEUE_NAME)

def get_one_news():
    """Return a single document from the news collection in MongoDB."""
    return mongodb_client.get_db()[NEWS_TABLE_NAME].find_one()

def get_news_summaries_for_user(user_id, page_num):
    """Get news list from MongoDB. """
    redis_client = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT)
    db = mongodb_client.get_db()

    news_index_begin = (int(page_num) - 1)* NEWS_LIST_SIZE
    news_index_end = news_index_begin + NEWS_LIST_SIZE
    sliced_news = []
Ejemplo n.º 19
0
# Redis connection settings.
REDIS_HOST = "localhost"
REDIS_PORT = 6379

# MongoDB collection names.
NEWS_TABLE_NAME = "news"
CLICK_LOGS_TABLE_NAME = 'click_logs'

NEWS_LIMIT = 100
# Page size used by getNewsSummariesForUser() to compute slice indices.
NEWS_LIST_BATCH_SIZE = 10
USER_NEWS_TIME_OUT_IN_SECONDS = 60

# TODO: Use my own queue
LOG_CLICKS_TASK_QUEUE_URL = "amqp://*****:*****@crocodile.rmq.cloudamqp.com/pamjhzbo"
LOG_CLICKS_TASK_QUEUE_NAME = "tap-news-log-clicks-task-queue"

# Both clients are created at import time.
redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT, db=0)
cloudAMQP_client = CloudAMQPClient(LOG_CLICKS_TASK_QUEUE_URL,
                                   LOG_CLICKS_TASK_QUEUE_NAME)


def getNewsSummariesForUser(user_id, page_num):
    page_num = int(page_num)
    begin_index = (page_num - 1) * NEWS_LIST_BATCH_SIZE
    end_index = page_num * NEWS_LIST_BATCH_SIZE

    # The final list of news to be returned.
    sliced_news = []

    if redis_client.get(user_id) is not None:
        news_digests = pickle.loads(redis_client.get(user_id))

        # If begin_index is out of range, this will return empty list;
        # If end_index is out of range (begin_index is within the range), this
Ejemplo n.º 20
0
# 24 hours expressed in seconds.
NEWS_TIME_OUT_IN_SECONDS = 3600 * 24
# (Identifier is spelled "SECOUNDS"; left as-is because it is referenced by name.)
SLEEP_TIME_IN_SECOUNDS = 10

# Use your own Cloud AMQP queue
SCRAPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@wasp.rmq.cloudamqp.com/yruquhmv"
SCRAPE_NEWS_TASK_QUEUE_NAME = "SCRAPE_NEWS_TASK"

# Source identifiers passed to news_api_client.getNewsFromSource().
NEWS_SOURCES = [
    'bbc-news', 'bbc-sport', 'bloomberg', 'cnn', 'entertainment-weekly',
    'espn', 'ign', 'techcrunch', 'the-new-york-times',
    'the-wall-street-journal', 'the-washington-post'
]

# Both clients are created at import time.
redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                   SCRAPE_NEWS_TASK_QUEUE_NAME)

while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)

    num_of_new_news = 0

    for news in news_list:
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')

        if redis_client.get(news_digest) is None:
            num_of_new_news = num_of_new_news + 1
            news['digest'] = news_digest

            if news['publishedAt'] is None:
Ejemplo n.º 21
0
    config = json.load(json_data_file)

# Settings read from the loaded `config` mapping.
REDIS_HOST = config['redis']['newsMonitor']['host']
REDIS_PORT = config['redis']['newsMonitor']['port']
# NOTE(review): despite the name, this holds the queue URL (the 'url' key).
CLOUD_AMQP_NAME = config['cloudAMQP']['scraperTaskQueue']['url']
QUEUE_NAME = config['cloudAMQP']['scraperTaskQueue']['name']
# NOTE(review): reads the same 'name' key as QUEUE_NAME, so this is the queue
# name rather than a duration -- looks like a copy-paste slip; confirm the
# intended config key.
SLEEP_IN_SECONDS = config['cloudAMQP']['scraperTaskQueue']['name']

# Source identifiers passed to getNewsList(sources=...) in monitorOn().
NEWS_SOURCES = [
    'bbc-news', 'bbc-sport', 'bloomberg', 'cnn', 'entertainment-weekly',
    'espn', 'ign', 'techcrunch', 'the-new-york-times',
    'the-wall-street-journal', 'the-washington-post'
]

# Both clients are created at import time.
redis_client = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT)
message_queue = CloudAMQPClient(CLOUD_AMQP_NAME, QUEUE_NAME)


def monitorOn(newsAPI_client=news_api_client):
    while True:
        news_list = newsAPI_client.getNewsList(sources=NEWS_SOURCES)
        num_of_news = 0

        for news in news_list:
            # create digest for Redis checking duplicates
            news_digest = hashlib.md5(
                news['title'].encode('utf-8')).hexdigest()
            if redis_client.get(news_digest) is None:
                num_of_news += 1
                news['digest'] = news_digest
Ejemplo n.º 22
0
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'utils'))

from cloudAMQP_client import CloudAMQPClient
import mongodb_client

# Read configuration file
# safe_load is used because a bare yaml.load(stream) is deprecated since
# PyYAML 5.1 and requires an explicit Loader from 6.0 on.
with open("../config.yml", 'r') as ymlfile:
    config = yaml.safe_load(ymlfile)

AMQP_URL = config['AMQP_URL']
DEDUPE_NEWS_QUEUE_NAME = 'top-news-dedupe-news-queue'
SLEEP_TIME_OUT_IN_SECONDS = 1
NEWS_TABLE_NAME = 'news'
NEWS_SIMILARITY_THRESHOLD = 0.8
# Connect dedupe queue
cloudAMQP_dedupe_client = CloudAMQPClient(AMQP_URL, DEDUPE_NEWS_QUEUE_NAME)
# Connect Mongo DB
db = mongodb_client.get_db()

# start to work
while True:
    # Get message from dedupe queue
    message = cloudAMQP_dedupe_client.getMessage()

    # handle the message
    if message is not None:
        try:
            handle_message(message)
        except Exception as e:
            print e
            pass
Ejemplo n.º 23
0
import sys
from newspaper import Article
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
import config_service_client
from cloudAMQP_client import CloudAMQPClient

# prepare logging
import logging
# Log at INFO level to a file under ./logging.
logging.basicConfig(filename='./logging/news_pipeline.log', level=logging.INFO)

# ask for mq configs
scrape_mq_config = config_service_client.getMessagequeueConfigForUsecase(
    'scrape_news_task')
dedupe_mq_config = config_service_client.getMessagequeueConfigForUsecase(
    'dedupe_news_task')
# Queue clients are created at import time from the fetched configs.
scrape_news_queue_client = CloudAMQPClient(scrape_mq_config['queue_url'],
                                           scrape_mq_config['queue_name'])
dedupe_news_queue_client = CloudAMQPClient(dedupe_mq_config['queue_url'],
                                           dedupe_mq_config['queue_name'])

# ask for other configs
fetch_config = config_service_client.getPipelineConfigForSection(
    'news_fetcher')
# Sleep times are converted to int from the fetched config values.
scrape_sleeptime_seconds = int(
    fetch_config['scrape_queue_client_sleeptime_seconds'])
dedupe_sleeptime_seconds = int(
    fetch_config['dedupe_queue_client_sleeptime_seconds'])


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print 'message from news_to_scrape is broken'
Ejemplo n.º 24
0
import mongodb_client
import zillow_api_client
import zillow_web_scraper_client

from cloudAMQP_client import CloudAMQPClient

# Automatically feed zpids into queue
### REPLACE CLOUD_AMQP_URL WITH YOUR OWN ###
CLOUD_AMQP_URL = '''amqp://*****:*****@hyena.rmq.cloudamqp.com/htwgtamk'''
DATA_FETCHER_QUEUE_NAME = 'dataFetcherTaskQueue'
ZIPCODE_FILE = 'bay_area_zipcode_list.txt'

# Pause between zip-code searches, passed to time.sleep() (seconds).
WAITING_TIME = 3

cloudAMQP_client = CloudAMQPClient(CLOUD_AMQP_URL, DATA_FETCHER_QUEUE_NAME)

# Read one zip code per line. Lines are stripped so the trailing newline is
# not sent along with the zip code, and blank lines are skipped.
with open(ZIPCODE_FILE, 'r') as zipcode_file:
    zipcode_list = [line.strip() for line in zipcode_file if line.strip()]

# For every zip code, look up its zpids and queue one fetch task per zpid.
for zipcode in zipcode_list:
    zpids = zillow_web_scraper_client.search_zillow_by_zip(zipcode)
    time.sleep(WAITING_TIME)

    for zpid in zpids:
        cloudAMQP_client.sendDataFetcherTask({'zpid': zpid})

Ejemplo n.º 25
0
import mongodb_client
import news_topic_modeling_service_client
from cloudAMQP_client import CloudAMQPClient

# TODO: use your own queue.
DEDUPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@donkey.rmq.cloudamqp.com/lkvdaice"
DEDUPE_NEWS_TASK_QUEUE_NAME = "tap-news-deduper-news-task-queue"

SLEEP_TIME_IN_SECONDS = 1

NEWS_TABLE_NAME = "news_test"

SAME_NEWS_SIMILARITY_THRESHOLD = 0.9

# Created at import time.
cloudAMQP_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                   DEDUPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task['text']
    if text is None:
        return

    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)
from bson.json_util import dumps
from datetime import datetime

# import common packages in parent directory.
# add the shared 'common' directory to the path Python searches for packages
# https://api.mongodb.com/python/current/installation.html
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
import mongodb_client  #pylint: disable=import-error, wrong-import-position
import news_recommendation_service_client  #pylint: disable=import-error, wrong-import-position

# import common package in parent directory
from cloudAMQP_client import CloudAMQPClient  #pylint: disable=import-error, wrong-import-position
# NOTE(review): the queue URL is an empty string here -- presumably a
# placeholder to be filled in; confirm before running.
CLICK_QUEUE_URL = ""
# (Identifier is spelled with a lowercase "l": ClICK; left as-is because it is
# referenced by name below.)
ClICK_QUEUE_NAME = "news-click-queue"
# Created at import time.
click_client = CloudAMQPClient(CLICK_QUEUE_URL, ClICK_QUEUE_NAME)

REDIS_HOST = "localhost"
REDIS_PORT = 6379

NEWS_LIST_BATCH_SIZE = 10
NEWS_LIMIT = 200
USER_NEWS_TIMEOUT_IN_SECONDS = 60 * 60  # one hour

# mongoimport --db tap_news_dev --collection news --drop --file ~/downloads/demo_news.json
NEWS_TABLE_NAME = "news"

# NOTE(review): built with library defaults; REDIS_HOST / REDIS_PORT above are
# not passed in -- confirm this is intended.
redis_client = redis.StrictRedis()


def getOneNews():
Ejemplo n.º 27
0
def test_basic():
    """Send one message to the test queue and read one message back.

    NOTE(review): no assertion on the received message is visible in this
    excerpt -- the function may continue beyond what is shown.
    """
    client = CloudAMQPClient(CLOUDAMQP_URL, TEST_QUEUE_NAME)
    sentMsg = {'test': 'demo'}
    client.sendMessage(sentMsg)
    # Pause between publishing and polling.
    client.sleep(1)
    receiveMsg = client.getMessage()
Ejemplo n.º 28
0
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import mongodb_client
from cloudAMQP_client import CloudAMQPClient

# Connection URL and queue name for the dedupe task queue.
DEDUPE_NEWS_TASK_CLOUDAMQP_URL = "amqp://*****:*****@wasp.rmq.cloudamqp.com/fauunoyn"
DEDUPE_NEWS_TASK_CLOUDAMQP_NAME = "news-dedupe-task-queue"

SLEEP_TIME_IN_SECONDS = 1

NEWS_TABLE_NAME = 'newslist'

SAME_NEWS_SIMILARITY_THRESHOLD = 0.8
# Module-level queue client, created at import time.
cloudAMQP_Client = CloudAMQPClient(DEDUPE_NEWS_TASK_CLOUDAMQP_URL,
                                   DEDUPE_NEWS_TASK_CLOUDAMQP_NAME)


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = str(task['text'].encode('utf-8'))
    if text is None:
        return

    # Get all recent news
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
Ejemplo n.º 29
0
# Load config.yaml from the parent directory of this file.
# safe_load is used because a bare yaml.load(stream) is deprecated since
# PyYAML 5.1 and requires an explicit Loader from 6.0 on.
with open(os.path.join(os.path.dirname(__file__), '..', "config.yaml"),
          'r') as config_file:
    config = yaml.safe_load(config_file)

# Use your own Cloud AMQP queue
DEDUPE_NEWS_TASK_QUEUE_URL = config["amqp"]["dedupe"]["url"]
DEDUPE_NEWS_TASK_QUEUE_NAME = config["amqp"]["dedupe"]["name"]
SCRAPE_NEWS_TASK_QUEUE_URL = config["amqp"]["scrape"]["url"]
SCRAPE_NEWS_TASK_QUEUE_NAME = config["amqp"]["scrape"]["name"]

SLEEP_TIME_IN_SECONDS = config["news_pipeline"]["fetcher"]["sleep_in_seconds"]
# Sources that handle_message() checks a task's 'source' against.
CUSTOM_SCRAPER_SOURCES = config["news_pipeline"]["fetcher"][
    "custom_scraper_sources"]

# Both queue clients are created at import time.
dedupe_news_queue_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                           DEDUPE_NEWS_TASK_QUEUE_NAME)
scrape_news_queue_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                           SCRAPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print 'message is broken'
        return

    task = msg
    source = task['source']
    source_url = task['url']
    article_text = ""

    if source in CUSTOM_SCRAPER_SOURCES:
Ejemplo n.º 30
0
# Redis connection settings from db_config.
REDIS_HOST = db_config['redis']['host']
REDIS_PORT = db_config['redis']['port']

# MongoDB collection names from db_config.
NEWS_TABLE_NAME = db_config['mongodb']['read_news_table']
CLICK_LOG_TABLE_NAME = db_config['mongodb']['click_log_table']

# News settings from news_config.
NEWS_CLASSES = news_config['classes']
NEWS_LIMIT = news_config['read_news_limit']
# Page size used by getNewsSummariesForUser() to compute slice indices.
NEWS_PER_PAGE = news_config['news_per_page']
USER_NEWS_TIME_OUT_IN_SECONDS = news_config['user_timeout_in_seconds']

# Both clients are created at import time. The Redis host and port are read
# from db_config again here rather than from REDIS_HOST / REDIS_PORT.
redis_client = redis.StrictRedis(db_config['redis']['host'],
                                 db_config['redis']['port'],
                                 db=db_config['redis']['strict_db'])
cloudAMQP_client = CloudAMQPClient(cloudAMQP_config['url'],
                                   cloudAMQP_config['click_log_queue_name'])


def getNewsSummariesForUser(user_id, page_num):
    page_num = int(page_num)
    begin_index = (page_num - 1) * NEWS_PER_PAGE
    end_index = page_num * NEWS_PER_PAGE

    # The final list of news to be returned
    sliced_news = []

    if redis_client.get(user_id) is not None:
        print 'Pulling page %s from Redis...' % page_num
        news = pickle.loads(redis_client.get(user_id))

        # If begin_index is out of range, this will return empty list;