Example #1
from celery_base import task
from random import random
from docker_logs import get_logger

logging = get_logger("runner")

result = task.delay(random()).get(timeout=10)

logging.info(f"Task returned: {result}")
Example #2
import os
import praw
from pymagnitude import Magnitude
from celery import Celery
from influxdb import InfluxDBClient

from docker_logs import get_logger

logging = get_logger("celery-base")

app = Celery()
app.conf.update({
    'task_routes': {
        'get_subreddit': {
            'queue': 'scraper'
        },
        'get_submission': {
            'queue': 'scraper'
        },
        'put_embeddings': {
            'queue': 'embedder'
        },
        'send_to_mongo': {
            'queue': 'mongo'
        }
    },
    'task_serializer': 'pickle',
    'result_serializer': 'pickle',
    'accept_content': ['pickle']
})
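With the routing table above, a producer only needs the task name; Celery resolves the queue from task_routes. A minimal sketch, assuming the config module above is saved as celery_base.py:

# Dispatch by name; routed to the 'scraper' queue via task_routes above.
from celery_base import app

result = app.send_task('get_subreddit')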
Example #3
"""Embedding worker."""
from celery import Celery
from docker_logs import get_logger
from mongodb_worker import save_submission

import fasttext
from nltk.tokenize import WordPunctTokenizer
import numpy as np

logging = get_logger("embedder")
app = Celery('celery_base', broker='amqp://localhost//', backend='amqp')

tokenizer = WordPunctTokenizer()
model_name = 'dbpedia.bin'
ft_model = fasttext.load_model(model_name)


@app.task(bind=True)
def embedding(self, work, submissions):
    """Embedds given submissions texts."""
    total = sum([len(s["comments"]) for s in submissions])
    logging.info(f'{work}: Submissions {len(submissions)} embedded'
                 f' with total {total} comments')
    for submission in submissions:
        subm_vectors = [
            ft_model[token] for token in tokenizer.tokenize(submission['text'])
        ]
        for comment in submission['comments']:
            subm_vectors.extend(
                [ft_model[token] for token in tokenizer.tokenize(comment)])
        if len(subm_vectors) > 0:
            # Truncated in the source; a plausible completion: average the
            # token vectors and hand the submission to the Mongo worker
            # imported above.
            submission['text_embedded'] = np.mean(subm_vectors, axis=0).tolist()
            save_submission.delay(work, submission)
Example #4
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf
from pyspark.ml.evaluation import RegressionEvaluator, \
    MulticlassClassificationEvaluator

from docker_logs import get_logger
from utils import get_class_distribution
from pipelines import get_linear_regression_pipeline, \
    get_binary_classification_pipeline, get_multi_classification_pipeline


logger = get_logger("app-spark")

conf = SparkConf().setAppName('app-spark').setMaster('local')
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

spark = SparkSession \
    .builder \
    .appName("app-spark") \
    .getOrCreate()

mongo_uri = ''.join([
    'mongodb://',
    os.environ['MONGO_INITDB_ROOT_USERNAME'],
    ':',
    os.environ['MONGO_INITDB_ROOT_PASSWORD'],
    # Truncated in the source; closed here following the URI pattern of
    # Example #13.
    '@',
    os.environ['mongodb_host'],
    ':',
    os.environ['mongodb_port'],
])
Example #5
"""DB worker task."""
from worker import app
from docker_logs import get_logger
from pymongo import MongoClient

logging = get_logger("mongo_db_task")
logging.propagate = False
client = MongoClient('mongodb:27017')
db = client.tweetmldb


@app.task(bind=True, name='mongo_task', queue='mongo')
def mongo_task(self, collection):
    """Saves new tweets in mongo db."""
    logging.info(f"DB ISERTION FIRED ")
    if len(collection) > 0:
        db.posts.insert_many(convert_objects_to_dicts(collection))
    logging.info(f"DB SIZE: {db.posts.count()} ")
    # client.close()


def convert_objects_to_dicts(collection):
    """Converts objects to dictionaries."""
    results = []
    for tweet in collection:
        results.append(tweet.__dict__)
    return results
"""Embedding worker."""
from worker import app
from docker_logs import get_logger
from pymagnitude import Magnitude
import gensim
import numpy as np
from mongo_task import mongo_task

logging = get_logger("embedding_task")
logging.propagate = False


def preprocess_text(text):
    """Splits text into tokens."""
    return gensim.utils.simple_preprocess(text)


def get_sentences_representation(vectors, splitted_sentence):
    """Counts average embedding."""
    length = 0
    av_sum = np.zeros(shape=(100, ))
    for i in range(len(splitted_sentence)):
        if splitted_sentence[i] in vectors:
            av_sum = av_sum + vectors.query(splitted_sentence[i])
            length += 1
    if length > 0:
        av_sum = av_sum / length
    return av_sum


def get_text_embedding(vectors, text):
    """Returns the averaged embedding for a whole text."""
    # Truncated in the source; a plausible body built from the helpers above.
    return get_sentences_representation(vectors, preprocess_text(text))
Example #7
from celery import Celery
from docker_logs import get_logger
logging = get_logger("task")

app = Celery()


@app.task(bind=True, name='task')
def task(self, param):
    logging.info(f"Celery task executed with param: {param}")
    return f"Result of task for param {param}"
Example #8
"""Sample pySpark app."""

from docker_logs import get_logger

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
from pyspark.sql import functions as F
from pyspark.ml.feature import QuantileDiscretizer

from logistic_regression import logistic_regression
from binary_classification import binary_classification
from multi_class_classification import multi_class_classification

logging = get_logger('spark_worker')

spark = SparkSession.builder.appName('MyModels').\
    config('spark.mongodb.input.uri',
           'mongodb://mongodb:27017/reddits.submissions').\
    getOrCreate()


def load(dev):
    """Loads the submissions from MongoDB database."""
    logging.info('Loading submissions...')
    df = dev.read.format('com.mongodb.spark.sql.DefaultSource').load()

    df.createOrReplaceTempView('submissions')
    df.printSchema()

    query = 'select score, upvote_ratio, is_nfsw, text_embedded from \
submissions'
    # Truncated in the source; presumably the query result is returned.
    return dev.sql(query)
Example #9
"""Mongodb management worker."""
import os
from celery import Celery
from docker_logs import get_logger
from pymongo import MongoClient

logging = get_logger("mongodb_worker")
app = Celery('celery_base', broker='amqp://localhost//', backend='amqp')
mongo_client = MongoClient(host=os.environ['MONGODB_HOST'],
                           port=int(os.environ['MONGODB_PORT']))


@app.task(bind=True)
def save_submission(self, work, submission):
    """Saves submission do Mongo database."""
    try:
        db = mongo_client.reddits
        col = db.submissions
        s_id = col.insert_one(submission).inserted_id
        logging.info(f'{work}: Submission saved into MongoDB: {s_id}')
    except Exception as e:
        logging.error(f'{work}: MongoDB saving error: {e}')
Example #10
import time
from datetime import datetime, timedelta
import os

import influxdb
import prawcore
from celery import signature

from celery_base import influxdb_client, reddit, app
from docker_logs import get_logger
from data_models import RedditSubmission

logging = get_logger("worker-scraper")


@app.on_after_configure.connect
def setup_periodic_tasks(sender, **kwargs):
    sender.add_periodic_task(int(os.environ['frequency_s']),
                             get_subreddit.s(),
                             name='subreddits')


@app.task(bind=True, name='get_subreddit')
def get_subreddit(self):
    subreddit_name = os.environ['subreddit']
    time_diff = int(os.environ['frequency_s'])
    current_time = datetime.utcnow()
    time_lower_bound = current_time - timedelta(seconds=time_diff)

    new_submissions = []
    json_metrics = []

    # Truncated in the source; a minimal sketch of the likely remainder:
    # collect submissions newer than the lower bound.
    for submission in reddit.subreddit(subreddit_name).new(limit=100):
        if datetime.utcfromtimestamp(submission.created_utc) < time_lower_bound:
            break
        new_submissions.append(submission)
Example #11
"""Test worker task. Will be removed."""
import requests
from worker import app
from docker_logs import get_logger

logging = get_logger("time_log_task")
logging.propagate = False


@app.task(bind=True, name='time_task', queue='time')
def scrap_tweets_from_location(self):
    """Logs time."""
    r = requests.get('http://webapp:5000/since')
    logging.info(f"CURRENT TIME {r.text} ")
Example #12
from pymagnitude import Magnitude
import numpy as np

from celery_base import app
from data_models import RedditSubmission
from docker_logs import get_logger

logger = get_logger("worker-embedder")


@app.task(bind=True, name='put_embeddings')
def put_embeddings(self, rSubmission: RedditSubmission):
    vecs = Magnitude('word2vec/light/GoogleNews-vectors-negative300')

    rSubmission.post_title_embedding = np.mean(
        vecs.query(rSubmission.post_title.split()), axis=0)

    if len(rSubmission.post_text) > 0:
        rSubmission.post_text_embedding = np.mean(
            vecs.query(rSubmission.post_text.split()), axis=0)

    return rSubmission
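Note that Example #12 opens the Magnitude vectors inside the task, so every invocation pays the model-loading cost. A sketch (not the author's code) of a variant that loads the vectors once per worker process, mirroring how Example #3 loads its fastText model at import time:

from pymagnitude import Magnitude
import numpy as np

from celery_base import app
from data_models import RedditSubmission

# Loaded once when the worker imports the module, not on every task call.
vecs = Magnitude('word2vec/light/GoogleNews-vectors-negative300')


@app.task(bind=True, name='put_embeddings')
def put_embeddings(self, rSubmission: RedditSubmission):
    rSubmission.post_title_embedding = np.mean(
        vecs.query(rSubmission.post_title.split()), axis=0)
    if len(rSubmission.post_text) > 0:
        rSubmission.post_text_embedding = np.mean(
            vecs.query(rSubmission.post_text.split()), axis=0)
    return rSubmission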
Example #13
import os
from dataclasses import asdict

import pymongo

from celery_base import app
from data_models import RedditSubmission
from docker_logs import get_logger

logging = get_logger("worker-mongo")

mongo_uri = ''.join([
    os.environ['mongodb_protocol'], '://',
    os.environ['MONGO_INITDB_ROOT_USERNAME'], ':',
    os.environ['MONGO_INITDB_ROOT_PASSWORD'], '@', os.environ['mongodb_host'],
    ':', os.environ['mongodb_port']
])
# https://pymongo.readthedocs.io/en/stable/faq.html#is-pymongo-fork-safe
# myclient = pymongo.MongoClient(mongo_uri)
# mydb = myclient["reddit"]
# mycol = mydb["submissions"]


@app.task(bind=True, name='send_to_mongo')
def send_to_mongo(self, rSubmission: RedditSubmission):

    myclient = pymongo.MongoClient(mongo_uri)
    mydb = myclient["reddit"]
    mycol = mydb["submissions"]

    # Truncated in the source; a plausible completion: convert the numpy
    # embeddings to plain lists and insert the dataclass as a document.
    rSubmission.post_title_embedding = list(rSubmission.post_title_embedding)
    if rSubmission.post_text_embedding is not None:
        rSubmission.post_text_embedding = list(rSubmission.post_text_embedding)
    mycol.insert_one(asdict(rSubmission))
Example #14
"""Scheduling worker."""
import os
from celery_base import app
from docker_logs import get_logger

logging = get_logger("worker")

app.conf.beat_schedule = {
    'work_every_n_minutes_new': {
        'task': 'celery_base.submissions',
        'schedule': float(os.environ['CELERYBEAT_MINUTES_INTERVAL']) * 60.0,
        'args': ('worker', 'AskReddit', 50, 'new',
                 float(os.environ['CELERYBEAT_MINUTES_INTERVAL']) * 60.0)
    }
}
app.conf.timezone = 'UTC'
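The schedule above targets a task registered as 'celery_base.submissions', which is not shown in these examples. A hypothetical signature consistent with the five-element 'args' tuple:

# Hypothetical sketch; the real task lives in celery_base and is not shown.
@app.task(name='celery_base.submissions')
def submissions(worker_name, subreddit, limit, sort, interval_s):
    """Fetches up to `limit` submissions from `subreddit` (sorted by `sort`)
    covering the last `interval_s` seconds."""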