Example #1
def test_basic():
    client = CloudAMQPClient(CLOUDAMQP_URL, TEST_QUEUE_NAME)

    sentMsg = {'test': 'demo'}
    client.sendMessage(sentMsg)
    client.sleep(10)
    receivedMsg = client.getMessage()
    assert sentMsg == receivedMsg
    print('test_basic passed!')
Example #2
# RabbitMQ config
### REPLACE CLOUD_AMQP_URL WITH YOUR OWN ###
CLOUD_AMQP_URL = ''
DATA_FETCHER_QUEUE_NAME = 'dataFetcherTaskQueue'

# mongodb config
PROPERTY_TABLE_NAME = 'property'

FETCH_SIMILAR_PROPERTIES = True

SECONDS_IN_ONE_DAY = 3600 * 24
SECONDS_IN_ONE_WEEK = SECONDS_IN_ONE_DAY * 7

WAITING_TIME = 3

cloudAMQP_client = CloudAMQPClient(CLOUD_AMQP_URL, DATA_FETCHER_QUEUE_NAME)


def handle_message(msg):
    task = json.loads(msg)

    if (not isinstance(task, dict) or 'zpid' not in task
            or task['zpid'] is None):
        return

    zpid = task['zpid']

    # Scrape the zillow for details
    property_detail = zillow_web_scraper_client.get_property_by_zpid(zpid)
    # Add timestamp
    property_detail['last_update'] = time.time()
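
The handler above stops right after stamping the scrape time. The usual next step in these fetchers is to persist the detail document; a minimal sketch, assuming the mongodb_client helper these projects import elsewhere and pymongo's replace_one (save_property_detail is a hypothetical name):

def save_property_detail(zpid, property_detail):
    # Upsert keyed on zpid so a re-scrape replaces the earlier
    # snapshot instead of inserting a duplicate document.
    db = mongodb_client.get_db()
    db[PROPERTY_TABLE_NAME].replace_one(
        {'zpid': zpid}, property_detail, upsert=True)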
Example #3
from cloudAMQP_client import CloudAMQPClient

CLOUDAMQP_URL = 'amqp://*****:*****@hyena.rmq.cloudamqp.com/kdflangt'
QUEUE_NAME = 'dataFetcherTaskQueue'

# init a client
client = CloudAMQPClient(CLOUDAMQP_URL, QUEUE_NAME)

# send a message
client.sendDataFetcherTask({'zpid': '83154148'})

# receive a message
# client.getDataFetcherTask()
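
Every example on this page imports CloudAMQPClient from cloudAMQP_client without showing it. A minimal sketch of what such a wrapper could look like on top of pika, assuming the sendMessage/getMessage/sleep methods the snippets call (the internals here are an assumption, not the original class):

import json
import time
import pika

class CloudAMQPClient:
    def __init__(self, cloud_amqp_url, queue_name):
        self.queue_name = queue_name
        params = pika.URLParameters(cloud_amqp_url)
        params.socket_timeout = 3
        self.connection = pika.BlockingConnection(params)
        self.channel = self.connection.channel()
        self.channel.queue_declare(queue=queue_name)

    def sendMessage(self, message):
        # JSON-encode the payload so dicts survive the round trip.
        self.channel.basic_publish(exchange='',
                                   routing_key=self.queue_name,
                                   body=json.dumps(message))

    def getMessage(self):
        method_frame, _, body = self.channel.basic_get(self.queue_name)
        if method_frame is None:
            return None
        self.channel.basic_ack(method_frame.delivery_tag)
        return json.loads(body)

    def sleep(self, seconds):
        # Service the connection while idle so the broker does not drop it.
        self.connection.process_data_events()
        time.sleep(seconds)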
Example #4
import operations
import os
import sys

from sets import Set

# import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import mongodb_client
CLICK_LOGS_TABLE_NAME = "click_logs"

from cloudAMQP_client import CloudAMQPClient
LOG_CLICKS_TASK_QUEUE_URL = "localhost"
LOG_CLICKS_TASK_QUEUE_NAME = "tap-news-log-clicks-task-queue"
cloudAMQP_client = CloudAMQPClient(LOG_CLICKS_TASK_QUEUE_URL,
                                   LOG_CLICKS_TASK_QUEUE_NAME)

# Start Redis and MongoDB before running following tests.


def test_getNewsSummariesForUser_basic():
    news = operations.getNewsSummariesForUser('test', 1)
    print(news)
    assert len(news) > 0
    print('test_getNewsSummariesForUser_basic passed!')


def test_getNewsSummariesForUser_pagination():
    news_page_1 = operations.getNewsSummariesForUser('test', 1)
    news_page_2 = operations.getNewsSummariesForUser('test', 2)
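
The pagination test above is truncated. A plausible completion, assuming each summary dict carries the digest field used throughout these pipelines, asserts that consecutive pages share no stories:

def test_getNewsSummariesForUser_pagination():
    news_page_1 = operations.getNewsSummariesForUser('test', 1)
    news_page_2 = operations.getNewsSummariesForUser('test', 2)
    # Two consecutive pages must not overlap.
    digests_page_1 = set(news['digest'] for news in news_page_1)
    digests_page_2 = set(news['digest'] for news in news_page_2)
    assert len(digests_page_1.intersection(digests_page_2)) == 0
    print('test_getNewsSummariesForUser_pagination passed!')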
Example #5
import mongodb_client
import news_topic_modeling_service_client

from cloudAMQP_client import CloudAMQPClient

# TODO: use your own queue.
DEDUPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@gull.rmq.cloudamqp.com/gxaifheg"
DEDUPE_NEWS_TASK_QUEUE_NAME = "tap-news-dedupe-news-task-queue"

SLEEP_TIME_IN_SECONDS = 1

# the table that stores the news
NEWS_TABLE_NAME = "news"
SAME_NEWS_SIMILARITY_THRESHOLD = 0.9

cloudAMQP_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL, DEDUPE_NEWS_TASK_QUEUE_NAME)

def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return

    task = msg
    text = task.get('text')
    if text is None:
        return

    # Get all recent news based on publishedAt
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year, published_at.month, published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)
Example #6
import os
import sys
import news_deduper as deduper
# import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
from cloudAMQP_client import CloudAMQPClient

DEDUPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@otter.rmq.cloudamqp.com/xemwnewz"
DEDUPE_NEWS_TASK_QUEUE_NAME = "tap-news-dedupe-news-task-queue"

cloudAMQP_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                   DEDUPE_NEWS_TASK_QUEUE_NAME)

TEST_MSG1 = ""
TEST_MSG2 = cloudAMQP_client.get_message()


def test_basic():
    deduper.handle_message(TEST_MSG1)
    deduper.handle_message(TEST_MSG2)
    print('test_basic passed!')


if __name__ == "__main__":
    test_basic()
Example #7
DB_CONFIG_FILE = os.path.join(os.path.dirname(__file__), '..',
                              'config/databases.yaml')
CLOUDAMQP_CONFIG_FILE = os.path.join(os.path.dirname(__file__), '..',
                                     'config/cloudAMQP.yaml')

with open(DB_CONFIG_FILE, 'r') as dbCfg:
    db_config = yaml.load(dbCfg)

with open(CLOUDAMQP_CONFIG_FILE, 'r') as amqpCfg:
    cloudAMQP_config = yaml.load(amqpCfg)

with open(NEWS_CONFIG_FILE, 'r') as newsCfg:
    news_config = yaml.load(newsCfg)

NEWS_TABLE_NAME = db_config['mongodb']['write_news_table']

dedupe_news_queue_client = CloudAMQPClient(
    cloudAMQP_config['url'], cloudAMQP_config['dedupe_queue_name'])


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return

    task = msg
    text = task['text'].encode('utf-8', 'ignore')
    if text == '':
        print('Message has no text!')
        return

    # get all recent news based on publishedAt
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
Example #8
#import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
#sys.path.append(os.path.join(os.path.dirname(__file__),'./','scrapers'))
import log_client
#import cnn_news_scraper
from cloudAMQP_client import CloudAMQPClient

LOG_GRAPHITE_TASK_QUEUE_URL = config['operations'][
    'LOG_GRAPHITE_TASK_QUEUE_URL']
LOG_GRAPHITE_TASK_QUEUE_NAME = config['operations'][
    'LOG_GRAPHITE_TASK_QUEUE_NAME']

SLEEP_TIME_IN_SECONDS = config['operations']['SLEEP_TIME_IN_SECONDS']

graphitelog_cloudAMQP_client = CloudAMQPClient(LOG_GRAPHITE_TASK_QUEUE_URL,
                                               LOG_GRAPHITE_TASK_QUEUE_NAME)


def handle_message(msg):
    if msg is None:
        log_client.logger.info('message is broken')
        #print 'message is broken'
        return

    counter = statsd.Counter(msg)
    counter += 1


while True:
    #fetch message from queue
    if graphitelog_cloudAMQP_client is not None:
Example #9
                'the-washington-post',
                'the-wall-street-journal',
                'usa-today'
                ]

# redis params
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
NEWS_TIMEOUT_IN_SECONDS = 3600 * 24 * 3
redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)

# cloudAMQP params
CLOUDAMQP_SLEEP_TIMEOUT_IN_SECONDS = 10
CLOUDAMQP_NEWS_SCRAPER_QUEUE_URL = 'amqp://*****:*****@crane.rmq.cloudamqp.com/ceunvrpr'
CLOUDAMQP_NEWS_SCRAPER_QUEUE_NAME = 'tap-news-scrape-news-task-queue'
cloudAMQP_client = CloudAMQPClient(CLOUDAMQP_NEWS_SCRAPER_QUEUE_URL, CLOUDAMQP_NEWS_SCRAPER_QUEUE_NAME)

while True:
    news_list = news_api_client.getNewsFromSources(NEWS_SOURCES)

    num_of_news = 0

    for news in news_list:
        news_digest = hashlib.md5(news['title'].encode('utf-8')).digest().encode('base64')

        if redis_client.get(news_digest) is None:
            news['digest'] = news_digest
            num_of_news += 1

            if news['publishedAt'] is None:
                news['publishedAt'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
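
The loop is cut off above. In this monitor pattern the remaining steps typically cache the digest in Redis with an expiry and enqueue the story for scraping; a sketch of that tail as a helper (mark_news_seen is a hypothetical name; redis-py's ex argument sets the expiry):

def mark_news_seen(news, news_digest):
    # Remember the digest so the next polling round skips this story;
    # the cache entry expires after NEWS_TIMEOUT_IN_SECONDS.
    redis_client.set(news_digest, news['title'],
                     ex=NEWS_TIMEOUT_IN_SECONDS)
    cloudAMQP_client.sendMessage(news)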
Example #10
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
sys.path.append(os.path.join(os.path.dirname(__file__), 'scrapers'))

import cnn_news_scraper
from cloudAMQP_client import CloudAMQPClient

# Use your own Cloud AMQP queue
DEDUPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@donkey.rmq.cloudamqp.com/ztdiqmcn"
DEDUPE_NEWS_TASK_QUEUE_NAME = "tap-news-dedupe-news-task-queue"
SCRAPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@donkey.rmq.cloudamqp.com/ztdiqmcn"
SCRAPE_NEWS_TASK_QUEUE_NAME = "tap-news-scrape-news-task-queue"

SLEEP_TIME_IN_SECONDS = 5

dedupe_news_queue_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL, DEDUPE_NEWS_TASK_QUEUE_NAME)
scrape_news_queue_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL, SCRAPE_NEWS_TASK_QUEUE_NAME)

def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print('message is broken')
        return
    task = msg
    
    article = Article(task['url'])
    article.download()
    article.parse()

    print(article.text)

    task['text'] = article.text
    dedupe_news_queue_client.sendMessage(task)
Example #11
NEWS_TIME_OUT_IN_SECONDS = 3600 * 24  # news expires in one day
SLEEP_TIME_IN_SECONDS = 10  # sleep 10 seconds between loops

SCRAPE_NEWS_TASK_QUEUE_URL = 'amqps://*****:*****@chimpanzee.rmq.cloudamqp.com/lkscpwqu'

SCRAPE_NEWS_TASK_QUEUE_NAME = 'feeder-scrape-news-task-queue'

NEWS_SOURCES = [
    'bbc-news', 'bbc-sport', 'bloomberg', 'cnn', 'entertainment-weekly',
    'espn', 'ign', 'techcrunch', 'the-new-york-times',
    'the-wall-street-journal', 'the-washington-post',
    'the-verge'
]

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                   SCRAPE_NEWS_TASK_QUEUE_NAME)

while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)
    num_of_new_news = 0
    for news in news_list:
        # news_digest = base64.b64encode(hashlib.md5(news['title'].encode('utf-8')).digest())
        # news_digest = hashlib.md5(news['title'].encode('utf-8')).digest().encode('base64')
        news_digest = hashlib.md5(news['title'].encode('utf-8')).hexdigest()
        # print type(news_digest)
        if redis_client.get(news_digest) is None:
            num_of_new_news = num_of_new_news + 1
            news['digest'] = news_digest  # every news item has a unique digest

            # use UTC time to avoid time zone differences
            if news['publishedAt'] is None:
Example #12
REDIS_HOST = config['redis']['HOST']
REDIS_PORT = config['redis']['PORT']

NEWS_LIST_BATCH_SIZE = config['backend_server'][
    'NEWS_LIST_BATCH_SIZE']  # number of news items on a single page
NEWS_LIMIT = config['backend_server'][
    'NEWS_LIMIT']  # maximum number of news items fetched from MongoDB at once
USER_NEWS_TIME_OUT_IN_SECONDS = config['backend_server'][
    'USER_NEWS_TIME_OUT_IN_SECONDS']  # timeout for a user's pagination info in Redis

CLICK_LOG_TASK_QUEUE_URL = config['cloudAMQP']['CLICK_LOG_TASK_QUEUE_URL']
CLICK_LOG_TASK_QUEUE_NAME = config['cloudAMQP']['CLICK_LOG_TASK_QUEUE_NAME']

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT, db=0)
cloudAMQP_client = CloudAMQPClient(CLICK_LOG_TASK_QUEUE_URL,
                                   CLICK_LOG_TASK_QUEUE_NAME)


def getOneNews():
    db = mongodb_client.get_db()
    news = db[NEWS_TABLE_NAME].find_one()
    return news


def getNewsSummariesForUser(user_id, page_num):

    page_num = int(page_num)
    # news range to be fetched for the page number
    begin_index = (page_num - 1) * NEWS_LIST_BATCH_SIZE
    end_index = page_num * NEWS_LIST_BATCH_SIZE
Example #13
import sys

from newspaper import Article

sys.path.append(os.path.join(os.path.dirname(__file__), '../backend_server/', 'utils'))

from cloudAMQP_client import CloudAMQPClient

DEDUPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@emu.rmq.cloudamqp.com/thclcviw"
DEDUPE_NEWS_TASK_QUEUE_NAME = "top-new-DEDUPE_NEWS_TASK_QUEUE_NAME"
SCRAPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@emu.rmq.cloudamqp.com/qfgxyvvk"
SCRAPE_NEWS_TASK_QUEUE_NAME = "top-news-SCRAPE_NEWS_TASK_QUEUE"

SLEEP_TIME_IN_SECONDS = 5

scrape_news_queue_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL, SCRAPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print('message is broken')
        return

    task = msg
    text = None

    article = Article(task['url'])
    article.download()
    article.parse()
    task['text'] = article.text
    dedupe_news_queue_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL, DEDUPE_NEWS_TASK_QUEUE_NAME)
Example #14
NEWS_LIST_BATCH_SIZE = 10
USER_NEWS_TIME_OUT_IN_SECONDS = 60

LOG_CLICKS_TASK_QUEUE_URL = "amqp://*****:*****@crane.rmq.cloudamqp.com/crxnfwlj"
LOG_CLICKS_TASK_QUEUE_NAME = "clickLog"
"""

with open('../configuration/backend_conf.yaml', 'r') as stream:
    try:
        config = yaml.load(stream)
    except yaml.YAMLError as error6:
        print(error6)

redis_client = redis.StrictRedis(config['operations']['REDIS_HOST'],
                                 int(config['operations']['REDIS_PORT']), db=0)
cloudAMQP_client = CloudAMQPClient(config['operations']['LOG_CLICKS_TASK_QUEUE_URL'],
                                  config['operations']['LOG_CLICKS_TASK_QUEUE_NAME'])

def getNewsSummariesForUser(user_id, page_num):
    page_num = int(page_num)
    begin_index = (page_num - 1) * int(config['operations']['NEWS_LIST_BATCH_SIZE'])
    end_index = page_num * int(config['operations']['NEWS_LIST_BATCH_SIZE'])

    # The final list of news to be returned.
    sliced_news = []

    if redis_client.get(user_id) is not None:
        news_digests = pickle.loads(redis_client.get(user_id))

        # If begin_index is out of range, this will return empty list;
        # If end_index is out of range (begin_index is within the range), this
        # will return all remaining news ids.
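
A quick standalone illustration of the slice semantics that comment describes:

digests = ['d1', 'd2', 'd3']
print(digests[5:10])   # begin_index out of range -> []
print(digests[2:10])   # end_index out of range -> ['d3']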
Example #15
from newspaper import Article

sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
from cloudAMQP_client import CloudAMQPClient

with open('../config.json') as json_data_file:
    config = json.load(json_data_file)

SCRAPE_TASK_QUEUE_URL = config['cloudAMQP']['scraperTaskQueue']['url']
SCRAPE_TASK_QUEUE_NAME = config['cloudAMQP']['scraperTaskQueue']['name']
DEDUPE_TASK_QUEUE_URL = config['cloudAMQP']['deduperTaskQueue']['url']
DEDUPE_TASK_QUEUE_NAME = config['cloudAMQP']['deduperTaskQueue']['name']

SLEEP_IN_SECONDS = config['cloudAMQP']['scraperTaskQueue']['sleep']

scrape_task_mq_client = CloudAMQPClient(SCRAPE_TASK_QUEUE_URL, SCRAPE_TASK_QUEUE_NAME)
dedupe_task_mq_client = CloudAMQPClient(DEDUPE_TASK_QUEUE_URL, DEDUPE_TASK_QUEUE_NAME)

def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print('message is invalid')
        return 

    task = msg

    article = Article(task['url'])
    article.download()
    article.parse()
    task['text'] = article.text
    dedupe_task_mq_client.sendMessage(task)
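
SLEEP_IN_SECONDS above implies a polling driver that this snippet does not show. A sketch of the usual loop, in the style of the while True consumer fragments elsewhere on this page (the getMessage call is assumed from that style):

while True:
    if scrape_task_mq_client is not None:
        msg = scrape_task_mq_client.getMessage()
        if msg is not None:
            # Scrape the article and forward it to the dedupe queue.
            handle_message(msg)
        scrape_task_mq_client.sleep(SLEEP_IN_SECONDS)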
Example #16
from cloudAMQP_client import CloudAMQPClient

DEDUPE_NEWS_TASK_QUEUE_URL = config['news_pipeline']['news_deduper'][
    'DEDUPE_NEWS_TASK_QUEUE_URL']
DEDUPE_NEWS_TASK_QUEUE_NAME = config['news_pipeline']['news_deduper'][
    'DEDUPE_NEWS_TASK_QUEUE_NAME']

SLEEP_TIME_IN_SECONDS = config['news_pipeline']['news_deduper'][
    'SLEEP_TIME_IN_SECONDS']

SAME_NEWS_SIMILARITY_THRESHOLD = config['news_pipeline']['news_deduper'][
    'SAME_NEWS_SIMILARITY_THRESHOLD']

NEWS_TABLE_NAME = config['news_pipeline']['news_deduper']['NEWS_TABLE_NAME']

cloudAMQP_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                   DEDUPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task.get('text')
    if text is None:
        return

    # Get all recent news based on publishedAt
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
Example #17
# import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import mongodb_client
import news_topic_modeling_service_client
from cloudAMQP_client import CloudAMQPClient

with open('../configuration/news_pipeline_conf.yaml', 'r') as stream:
    try:
        config = yaml.load(stream)
    except yaml.YAMLError as exc:
        print(exc)

cloudAMQP_client = CloudAMQPClient(
    config['news_deduper']['DEDUPE_NEWS_TASK_QUEUE_URL'],
    config['news_deduper']['DEDUPE_NEWS_TASK_QUEUE_NAME'])


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task.get('text')
    if text is None:
        return
    # get news from the database published around the same time
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
Example #18
from cloudAMQP_client import CloudAMQPClient

REDIS_HOST = 'localhost'
REDIS_PORT = 6379

NEWS_TABLE_NAME = '[top-news]'  # table name in MongoDb
PAGE_SIZE = 10
NEWS_LIMIT = 100
USER_NEWS_TIME_OUT_IN_SECONDS = 60  # timeout for user's pagination info in Redis

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT, db=0)

USER_CLICK_LOG_QUEUE_URL = 'amqp://*****:*****@otter.rmq.cloudamqp.com/xhzhqriu'
USER_CLICK_LOG_QUEUE_NAME = 'UserClickLogQueue'
cloudAMQP_client = CloudAMQPClient(USER_CLICK_LOG_QUEUE_URL,
                                   USER_CLICK_LOG_QUEUE_NAME)


def getOneNews():
    db = mongodb_client.get_db()
    news = db[NEWS_TABLE_NAME].find_one()
    # bson.json_util.dumps serializes the BSON document into a JSON string;
    # json.loads then deserializes that string back into a JSON object.
    return json.loads(dumps(news))


def getNewsCount():
    db = mongodb_client.get_db()
    count = db[NEWS_TABLE_NAME].count()
    return count
Example #19
import cnn_news_scraper #pylint: disable=import-error, wrong-import-position

SCRAPER_QUEUE_URL = ""
SCRAPER_QUEUE_NAME = "news-scraper-queue"

DEDUPER_QUEUE_URL = ""
DEDUPER_QUEUE_NAME = "news-deduper-queue"

SLEEP_TIME_IN_SECONDS = 5

logger_format = '%(asctime)s - %(message)s'
logging.basicConfig(format=logger_format)
logger = logging.getLogger('news_fetcher')
logger.setLevel(logging.DEBUG)

scrape_queue_client = CloudAMQPClient(SCRAPER_QUEUE_URL, SCRAPER_QUEUE_NAME)
dedupe_queue_client = CloudAMQPClient(DEDUPER_QUEUE_URL, DEDUPER_QUEUE_NAME)

def handle_message_old(msg):
  # skip the message if it is not a JSON object
  if not isinstance(msg, dict):
    logger.warning('message is broken')
    return
  
  text = None
  if msg['source'] == 'cnn':
    text = cnn_news_scraper.extract_news(msg['url'])

  if text is not None and len(text) > 0:
    msg['text'] = text
    dedupe_queue_client.sendMessage(msg)
Example #20
DEDUPER_QUEUE_URL = ""
DEDUPER_QUEUE_NAME = "news-deduper-queue"

SLEEP_TIME_IN_SECONDS = 1

NEWS_TABLE_NAME = "news_list"

SAME_NEWS_SIMILARITY_THRESHOLD = 0.82

logger_format = '%(asctime)s - %(message)s'
logging.basicConfig(format=logger_format)
logger = logging.getLogger('news_deduper')
logger.setLevel(logging.DEBUG)

cloudAMQP_client = CloudAMQPClient(DEDUPER_QUEUE_URL, DEDUPER_QUEUE_NAME)

stemmer = PorterStemmer()
translator = str.maketrans('','',string.punctuation)

def stem_tokens(tokens, stemmer):
  stemmed = []
  for item in tokens:
    stemmed.append(stemmer.stem(item))
  return stemmed

def tokenize(text):
  tokens = nltk.word_tokenize(text)
  stems = stem_tokens(tokens, stemmer)
  return stems
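
With SAME_NEWS_SIMILARITY_THRESHOLD defined above, the deduper typically compares two articles by TF-IDF cosine similarity over these stemmed tokens; a sketch, assuming scikit-learn is available (is_duplicate is a hypothetical helper):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def is_duplicate(text_1, text_2):
  # Vectorize both documents with the stemmer-backed tokenizer above,
  # then compare them by cosine similarity.
  tfidf = TfidfVectorizer(tokenizer=tokenize)
  matrix = tfidf.fit_transform([text_1, text_2])
  similarity = cosine_similarity(matrix[0], matrix[1])[0][0]
  return similarity > SAME_NEWS_SIMILARITY_THRESHOLD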
Example #21
with open("../config.yml", "r") as stream:
    load = yaml.load(stream)
config = load['default']['common']

DEDUPE_NEWS_TASK_QUEUE_URL = config['cloudAMQP']['DEDUPE_NEWS_TASK_QUEUE_URL']
DEDUPE_NEWS_TASK_QUEUE_NAME = config['cloudAMQP'][
    'DEDUPE_NEWS_TASK_QUEUE_NAME']

SCRAPE_NEWS_TASK_QUEUE_URL = config['cloudAMQP']['SCRAPE_NEWS_TASK_QUEUE_URL']
SCRAPE_NEWS_TASK_QUEUE_NAME = config['cloudAMQP'][
    'SCRAPE_NEWS_TASK_QUEUE_NAME']

SLEEP_IN_SECOND = config['cloudAMQP']['FETCHER_SLEEP_IN_SECOND']

dedupe_news_queue_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                           DEDUPE_NEWS_TASK_QUEUE_NAME)
scrape_news_queue_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                           SCRAPE_NEWS_TASK_QUEUE_NAME)


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print('msg is broken')
        return
    task = msg

    # text = None
    # We only support CNN for now
    # if task['source'] == 'cnn':
    #     print 'Scraping CNN news'
    #     text = scraper.cnn_news_scraper.extract_news(task['url'])
Example #22
from newspaper import Article

sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
sys.path.append(os.path.join(os.path.dirname(__file__), 'scrapers'))

import cnnNewsScraper
from cloudAMQP_client import CloudAMQPClient

SCRAPE_NEWS_TASK_CLOUDAMQP_URL = "amqp://*****:*****@wasp.rmq.cloudamqp.com/fauunoyn"
SCRAPE_NEWS_TASK_CLOUDAMQP_NAME = "news-scrape-task-queue"
DEDUPE_NEWS_TASK_CLOUDAMQP_URL = "amqp://*****:*****@wasp.rmq.cloudamqp.com/fauunoyn"
DEDUPE_NEWS_TASK_CLOUDAMQP_NAME = "news-dedupe-task-queue"

SLEEP_TIME_IN_SECONDS = 5

dedupe_queue_client = CloudAMQPClient(DEDUPE_NEWS_TASK_CLOUDAMQP_URL,
                                      DEDUPE_NEWS_TASK_CLOUDAMQP_NAME)
scrape_queue_client = CloudAMQPClient(SCRAPE_NEWS_TASK_CLOUDAMQP_URL,
                                      SCRAPE_NEWS_TASK_CLOUDAMQP_NAME)


def handle_message(msg):
    global dedupe_queue_client

    if msg is None or not isinstance(msg, dict):
        print('message is broken')
        return

    task = msg

    article = Article(task['url'])
    article.download()
Example #23
from cloudAMQP_client import CloudAMQPClient

CLOUDAMQP_URL = 'amqp://*****:*****@hyena.rmq.cloudamqp.com/jnqrsjwd'
QUEUE_NAME = 'dataFetcherTaskQueue'

# Initialize a client
client = CloudAMQPClient(CLOUDAMQP_URL, QUEUE_NAME)

# Send a message
client.sendDataFetcherTask({'name': 'test message'})

# Receive a message
# client.getDataFetcherTask()
Example #24
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import news_api_client
from cloudAMQP_client import CloudAMQPClient

with open('../configuration/news_pipeline_conf.yaml', 'r') as stream:
    try:
        config = yaml.load(stream)
    except yaml.YAMLError as exc:
        print(exc)

redis_client = redis.StrictRedis(config['news_monitor']['REDIS_HOST'],
                                 int(config['news_monitor']['REDIS_PORT']))

cloudAMQP_client = CloudAMQPClient(
    config['news_monitor']['SCRAPE_NEWS_TASK_QUEUE_URL'],
    config['news_monitor']['SCRAPE_NEWS_TASK_QUEUE_NAME'])

while True:
    news_list = news_api_client.getNewsFromSource(
        config['news_monitor']['NEWS_SOURCES'])

    num_of_new_news = 0

    for news in news_list:
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode(
                'base64')  # digest can be used as a unique ID

        if redis_client.get(news_digest) is None:
            # a new news story has come in
Example #25
NEWS_TIME_OUT_IN_SECONDS = 3600 * 24 * 3

REDIS_HOST = 'localhost'
REDIS_PORT = 6379

SCRAPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@otter.rmq.cloudamqp.com/cwvictdm"
SCRAPE_NEWS_TASK_QUEUE_NAME = "tap-news-scrape-news-task-queue"

NEWS_SOURCES = [
    'bbc-news', 'bbc-sport', 'bloomberg', 'cnn', 'entertainment-weekly',
    'espn', 'ign', 'techcrunch', 'the-new-york-times',
    'the-wall-street-journal', 'the-washington-post'
]

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                   SCRAPE_NEWS_TASK_QUEUE_NAME)


def run():
    while True:
        news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)

        num_of_new_news = 0

        for news in news_list:
            news_digest = hashlib.md5(
                news['title'].encode('utf-8')).hexdigest()

            if redis_client.get(news_digest) is None:
                num_of_new_news = num_of_new_news + 1
                news['digest'] = news_digest
Example #26
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import mongodb_client
from cloudAMQP_client import CloudAMQPClient

# Use your own Cloud AMQP queue
DEDUPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@donkey.rmq.cloudamqp.com/ztdiqmcn"
DEDUPE_NEWS_TASK_QUEUE_NAME = "tap-news-dedupe-news-task-queue"

SLEEP_TIME_IN_SECONDS = 1

NEWS_TABLE_NAME = "news"

SAME_NEWS_SIMILARITY_THRESHOLD = 0.8

cloudAMQP_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL, DEDUPE_NEWS_TASK_QUEUE_NAME)

def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task.get('text')
    if text is None:
        return

    # get all recent news based on publishedAt
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year, published_at.month, published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
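
The snippet ends right after grabbing the database handle. What usually follows is a range query for same-day candidates to deduplicate against; a sketch, assuming publishedAt is stored as a datetime field (get_same_day_news is a hypothetical helper):

def get_same_day_news(db, published_at_day_begin, published_at_day_end):
    # Only stories published on the same day need to be compared.
    return list(db[NEWS_TABLE_NAME].find(
        {'publishedAt': {'$gte': published_at_day_begin,
                         '$lt': published_at_day_end}}))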
Example #27
from cloudAMQP_client import CloudAMQPClient

# Don't modify this value unless you know what you are doing.
NUM_OF_CLASSES = 17
INITIAL_P = 1.0 / NUM_OF_CLASSES
ALPHA = 0.1

SLEEP_TIME_IN_SECONDS = 1

LOG_CLICKS_TASK_QUEUE_URL = 'amqp://*****:*****@donkey.rmq.cloudamqp.com/zylpcqxg'
LOG_CLICKS_TASK_QUEUE_NAME = 'tap-news-log-clicks-task-queue'

PREFERENCE_MODEL_TABLE_NAME = 'user_preference_model'
NEWS_TABLE_NAME = 'news-test'

cloudAMQP_client = CloudAMQPClient(LOG_CLICKS_TASK_QUEUE_URL,
                                   LOG_CLICKS_TASK_QUEUE_NAME)


def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return

    if ('userId' not in msg or 'newsId' not in msg or 'timestamp' not in msg):
        return

    userId = msg['userId']
    newsId = msg['newsId']

    # Update user's preference
    db = mongodb_client.get_db()
    model = db[PREFERENCE_MODEL_TABLE_NAME].find_one({'userId': userId})
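
The handler is truncated after loading the model. Given INITIAL_P and ALPHA above, the click is normally folded in with an exponential update that keeps the class probabilities summing to 1; a sketch, assuming a hypothetical 'preference' list of NUM_OF_CLASSES floats on the model and a clicked_class index derived from the clicked news:

def update_preference(model, user_id, clicked_class):
    # Start from a uniform prior if the user has no model yet.
    if model is None:
        model = {'userId': user_id,
                 'preference': [INITIAL_P] * NUM_OF_CLASSES}
    # Boost the clicked class and decay all others; the total stays 1.
    for class_index in range(NUM_OF_CLASSES):
        old_p = model['preference'][class_index]
        if class_index == clicked_class:
            model['preference'][class_index] = (1 - ALPHA) * old_p + ALPHA
        else:
            model['preference'][class_index] = (1 - ALPHA) * old_p
    return model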
Example #28
# 1st AMQP, stores news digest and url to scrape news body from
SCRAPE_NEWS_TASK_QUEUE_URL = 'amqp://*****:*****@otter.rmq.cloudamqp.com/xhzhqriu'
SCRAPE_NEWS_TASK_QUEUE_NAME = 'TopNewsTitleQueue'
# 2nd AMQP, stores news body for news deduper to consume
DEDUPE_NEWS_TASK_QUEUE_URL = 'amqp://*****:*****@otter.rmq.cloudamqp.com/xhzhqriu'
DEDUPE_NEWS_TASK_QUEUE_NAME = 'TopNewsQueue'

# interval between two processes
SLEEP_TIME_IN_SECONDS = 5
# put both queues to sleep in turn a certain number of times
# total pause time (when no msg in Q) 2 * 45 * 10 = 900
PAUSE_INTERVAL_IN_EACH_LOOP = 45
LOOPS = 10

dedupe_news_queue_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL,
                                           DEDUPE_NEWS_TASK_QUEUE_NAME)
scrape_news_queue_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                           SCRAPE_NEWS_TASK_QUEUE_NAME)


# process msg: attach text field (news body), then push it to next AMQP
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print('message is broken')
        return

    task = msg

    article = Article(task['url'])
    article.download()
    article.parse()
Example #29
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import news_api_client
from cloudAMQP_client import CloudAMQPClient

NEWS_TIME_OUT_IN_SECONDS = 3600 * 24 * 3
SLEEP_TIME_IN_SECONDS = 10

REDIS_HOST = 'localhost'
REDIS_PORT = 6379

SCRAPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@crane.rmq.cloudamqp.com/mcyrgohw"
SCRAPE_NEWS_TASK_QUEUE_NAME = "tap-news-scrape-news-task-queue"

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL,
                                   SCRAPE_NEWS_TASK_QUEUE_NAME)

NEWS_SOURCES = [
    'bbc-news', 'bbc-sport', 'bloomberg', 'cnn', 'entertainment-weekly',
    'espn', 'ign', 'techcrunch', 'the-new-york-times',
    'the-wall-street-journal', 'the-washington-post'
]

while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)

    num_of_new_news = 0

    for news in news_list:
        news_digest = hashlib.md5(
            news['title'].encode('utf-8')).digest().encode('base64')
Example #30
import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), 'common'))
import time
from cloudAMQP_client import CloudAMQPClient  # pylint: disable=E0401
from dotenv import load_dotenv  # pylint: disable=E0401

dotenv_path = os.path.join(os.path.dirname(__file__), '..', '.env')
load_dotenv(dotenv_path)

MQ_CAMP_FETCHER_TASK_NAME = os.environ.get("MQ_CAMP_FETCHER_TASK_NAME")
MQ_CAMP_FETCHER_TASK_URI = os.environ.get("MQ_CAMP_FETCHER_TASK_URI")

fetch_camp_client = CloudAMQPClient(
    MQ_CAMP_FETCHER_TASK_URI, MQ_CAMP_FETCHER_TASK_NAME)

SLEEP_TIME_IN_SECONDS = 1

ROOT_URL = "http://find.acacamps.org/camp_profile.php?camp_id="


def build_camp_url(index):
    camp_url = '%s%s' % (ROOT_URL, str(index))
    return camp_url


def run():
    ''' Loop over the camp profile pages by index and enqueue each camp's URL as a fetch task on the message queue '''
    for index in range(1, 4869):
        camp_url = build_camp_url(index)
        task = {'url': camp_url, 'camp_id': index}
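
run() is cut off after building the task dict; the remainder presumably publishes each task and throttles the loop. A plausible tail, reusing the sendMessage/sleep calls this wrapper style exposes (an assumption, since the original file is truncated):

        # Publish the fetch task, then pause so the queue is not flooded.
        fetch_camp_client.sendMessage(task)
        fetch_camp_client.sleep(SLEEP_TIME_IN_SECONDS)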
Example #31
# prepare logging
import logging
logging.basicConfig(filename='./logging/backend_server.log',
                    level=logging.INFO)

# ask server configs
server_config = config_service_client.getServerConfigForServer(
    'backend_server')
server_host = server_config['url']
server_port = int(server_config['port'])

# ask mq configs
mq_config = config_service_client.getMessagequeueConfigForUsecase(
    'log_clicks_task')
cloudAMQP_client = CloudAMQPClient(mq_config['queue_url'],
                                   mq_config['queue_name'])


class RequestHandler(pyjsonrpc.HttpRequestHandler):
    @pyjsonrpc.rpcmethod
    def getNewsSummariesForUser(self, user_id, page_num):
        logging.info('backend_server: %s asks for page %s' %
                     (user_id, page_num))
        return operations.getNewsSummariesForUser(user_id, page_num)

    @pyjsonrpc.rpcmethod
    def logNewsClickForUser(self, user_id, news_id, rate):
        logging.info('backend_server: %s rates %s for news: %s' %
                     (user_id, rate, news_id))
        return operations.logNewsClickForUser(user_id, news_id, rate)
Example #32
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import mongodb_client
import zillow_web_scraper_client

from cloudAMQP_client import CloudAMQPClient

# RabbitMQ config
CLOUD_AMQP_URL = '''amqp://*****:*****@hyena.rmq.cloudamqp.com/kdflangt'''
DATA_FETCHER_QUEUE_NAME = 'dataFetcherTaskQueue'
ZIPCODE_FILE = 'san_diego_zipcode_list.txt'
SHUFFLE_ZIPCODES = True

WAITING_TIME = 3

cloudAMQP_client = CloudAMQPClient(CLOUD_AMQP_URL, DATA_FETCHER_QUEUE_NAME)

zipcode_list = []

with open(ZIPCODE_FILE, 'r') as zipcode_file:
    for zipcode in zipcode_file:
        # strip the trailing newline from each line
        zipcode_list.append(zipcode.strip())

if SHUFFLE_ZIPCODES:
    print "shuffle zipcodes!"
    random.shuffle(zipcode_list)

for zipcode in zipcode_list:
    zpids = zillow_web_scraper_client.get_zpid_by_zipcode(zipcode)
    time.sleep(WAITING_TIME)
Example #33
import os
import sys
from newspaper import Article
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
import config_service_client
from cloudAMQP_client import CloudAMQPClient

# prepare logging
import logging
logging.basicConfig(filename='./logging/news_pipeline.log', level=logging.INFO)

# ask for mq configs
scrape_mq_config = config_service_client.getMessagequeueConfigForUsecase('scrape_news_task')
dedupe_mq_config = config_service_client.getMessagequeueConfigForUsecase('dedupe_news_task')
scrape_news_queue_client = CloudAMQPClient(scrape_mq_config['queue_url'], scrape_mq_config['queue_name'])
dedupe_news_queue_client = CloudAMQPClient(dedupe_mq_config['queue_url'], dedupe_mq_config['queue_name'])

# ask for other configs
fetch_config = config_service_client.getPipelineConfigForSection('news_fetcher')
scrape_sleeptime_seconds = int(fetch_config['scrape_queue_client_sleeptime_seconds'])
dedupe_sleeptime_seconds = int(fetch_config['dedupe_queue_client_sleeptime_seconds'])

def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print('message from news_to_scrape is broken')
        logging.error('news_fetcher: message from news_to_scrape is broken')
        return
    
    # use Newspaper to scrape the text of news
    task = msg
    text = None
Example #34
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import mongodb_client
from cloudAMQP_client import CloudAMQPClient


DEDUPE_NEWS_TASK_QUEUE_URL = "amqp://*****:*****@otter.rmq.cloudamqp.com/ncbqvnsd"
DEDUPE_NEWS_TASK_QUEUE_NAME = "pin_news_deduper"

SLEEP_TIME_IN_SECONDS = 1

SAME_NEWS_SIMILARITY_THRESHOLD = 0.9

NEWS_TABLE_NAME = "pin_news"

dedupe_news_queue_client = CloudAMQPClient(DEDUPE_NEWS_TASK_QUEUE_URL, DEDUPE_NEWS_TASK_QUEUE_NAME)

def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    content = task.get('content')
    if content is None:
        return

    # Get all recent news based on publishedAt
    published_at = parser.parse(task['publishedAt'])
    # subtract via timedelta so this also works on the first day of a month
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day,
                                               0, 0, 0, 0) - datetime.timedelta(days=1)
Example #35
NEWS_SOURCES = [
    'bbc-news',
    'bbc-sport',
    'bloomberg',
    'cnn',
    'entertainment-weekly',
    'espn',
    'ign',
    'techcrunch',
    'the-new-york-times',
    'the-wall-street-journal',
    'the-washington-post'
]

redis_client = redis.StrictRedis(REDIS_HOST, REDIS_PORT)
cloudAMQP_client = CloudAMQPClient(SCRAPE_NEWS_TASK_QUEUE_URL, SCRAPE_NEWS_TASK_QUEUE_NAME)

while True:
    news_list = news_api_client.getNewsFromSource(NEWS_SOURCES)
    num_of_new_news = 0
    for news in news_list:
        news_digest = hashlib.md5(news['title'].encode('utf-8')).digest().encode('base64')

        if redis_client.get(news_digest) is None:
            num_of_new_news = num_of_new_news + 1
            news['digest'] = news_digest

            if news['publishedAt'] is None:
                # format: YYYY-MM-DDTHH:MM:SS in UTC
                news['publishedAt'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
Example #36
import os
import sys
import redis
import hashlib
import datetime
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
import news_api_client
import config_service_client
from cloudAMQP_client import CloudAMQPClient

# prepare logging
import logging
logging.basicConfig(filename='./logging/news_pipeline.log', level=logging.INFO)

# ask for mq and memory configs
mq_config = config_service_client.getMessagequeueConfigForUsecase('scrape_news_task')
cloudAMQP_client = CloudAMQPClient(mq_config['queue_url'], mq_config['queue_name'])

mmr_config = config_service_client.getMemoryConfig('redis')
redis_client = redis.StrictRedis(mmr_config['host'], mmr_config['port'])

# ask for other params
news_monitor_config = config_service_client.getPipelineConfigForSection('news_monitor')
news_sources = news_monitor_config['news_sources']
news_timeout_seconds = int(news_monitor_config['news_timeout_seconds'])
sleeptime_seconds = int(news_monitor_config['scrape_queue_client_sleeptime_seconds'])

while True:
    # fetch the latest news list; most stories may be old duplicates already seen
    news_list = news_api_client.getNewsFromSource(news_sources)
    num_of_new_news = 0