def __init__(self, ip_addr, kafka_config_infile, s3bucket_config_infile):
    if not os.path.exists('./tmp'):
        os.makedirs('./tmp')
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        filename='./tmp/kafka_producer.log',
                        filemode='w')
    self.logger = logging.getLogger('py4j')

    self.kafka_config = helpers.parse_config(kafka_config_infile)
    self.s3bucket_config = helpers.parse_config(s3bucket_config_infile)
    self.producer = KafkaProducer(bootstrap_servers=ip_addr)
def __init__(self, kafka_configfile, schema_file, s3_configfile):
    """
    class constructor that initializes the instance according to the
    configurations of the S3 bucket and Kafka
    :type kafka_configfile: str     path to kafka config file
    :type schema_file     : str     path to schema file
    :type s3_configfile   : str     path to S3 config file
    """
    self.kafka_config = helpers.parse_config(kafka_configfile)
    self.schema = helpers.parse_config(schema_file)
    self.s3_config = helpers.parse_config(s3_configfile)

    self.producer = KafkaProducer(
        bootstrap_servers=self.kafka_config["BROKERS_IP"])
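# Hypothetical follow-up to the constructor above: once the KafkaProducer is
# built from the parsed config, messages are typically published with
# kafka-python's send()/flush(). The broker address, topic name and payload
# here are illustrative assumptions, not taken from the original project.
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers="localhost:9092")
producer.send("example_topic", value=b'{"msg": "hello"}')
producer.flush()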
def __init__(self, s3_configfile, schema_configfile, psql_configfile):
    """
    class constructor that initializes the instance according to the
    configurations of the S3 bucket, raw data and PostgreSQL table
    :type s3_configfile:     str     path to s3 config file
    :type schema_configfile: str     path to schema config file
    :type psql_configfile:   str     path to psql config file
    """
    self.s3_config = helpers.parse_config(s3_configfile)
    self.schema = helpers.parse_config(schema_configfile)
    self.psql_config = helpers.parse_config(psql_configfile)

    self.sc = pyspark.SparkContext.getOrCreate()
    self.sc.setLogLevel("ERROR")
def test_parse_config(self):
    # test that parse_config correctly reads the config file
    conf = {"field1": "val1", "field2": {"subfield1": 2, "subfield2": "3"}}
    with patch("__builtin__.open", mock_open(read_data=json.dumps(conf))) as mock_file:
        self.assertEqual(conf, helpers.parse_config(mock_file),
                         "failed to properly read config from file")
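# A minimal sketch of the helpers.parse_config under test, assuming the config
# files are plain JSON (consistent with the mock_open/json.dumps setup above);
# the real helper may differ in error handling or file format support.
import json

def parse_config(configfile):
    """Read a JSON config file and return its contents as a dict."""
    with open(configfile) as f:
        return json.load(f)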
def __init__(self, kafka_configfile, schema_configfile, stream_configfile, start_offset):
    """
    class constructor that initializes the instance according to the
    configurations of Kafka (brokers, topic, offsets), data schema and
    batch interval for streaming
    :type kafka_configfile:  str     path to kafka config file
    :type schema_configfile: str     path to schema config file
    :type stream_configfile: str     path to stream config file
    :type start_offset:      int     offset from which to read from partitions of Kafka topic
    """
    self.kafka_config = helpers.parse_config(kafka_configfile)
    self.stream_config = helpers.parse_config(stream_configfile)
    self.schema = helpers.parse_config(schema_configfile)

    self.start_offset = start_offset

    self.sc = pyspark.SparkContext.getOrCreate()
    self.ssc = pyspark.streaming.StreamingContext(
        self.sc, self.stream_config["INTERVAL"])
    self.sc.setLogLevel("ERROR")
def __init__(self, kafka_config_infile, ecg_spark_config_infile,
             postgres_config_infile, s3bucket_config_infile, batch_interval):
    if not os.path.exists('./tmp'):
        os.makedirs('./tmp')
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        filename='./tmp/spark_consumer.log',
                        filemode='w')
    self.logger = logging.getLogger('py4j')
    self.logger.setLevel(logging.WARN)

    self.ecg_spark_config = helpers.parse_config(ecg_spark_config_infile)
    self.postgres_config = helpers.parse_config(postgres_config_infile)
    self.s3bucket_config = helpers.parse_config(s3bucket_config_infile)
    self.kafka_config = helpers.parse_config(kafka_config_infile)

    self.sc = SparkContext(appName='ECGDashboardApp')
    self.sc.setLogLevel("FATAL")
    self.ssc = StreamingContext(self.sc, batch_interval)
    self.logger.warn('Opened Spark context')
    self.kafkastream = self.connectToKafkaBrokers()
    self.logger.warn('Opened connection to Kafka brokers')
    self.a = self.sc.accumulator(0)
def __init__(self, postgres_config_infile):
    if not os.path.exists('./tmp'):
        os.makedirs('./tmp')
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        filename='./tmp/website.log',
                        filemode='w')
    self.logger = logging.getLogger('py4j')

    self.postgres_config = helpers.parse_config(postgres_config_infile)
    self.cur = self.connectToDB()

    self.signal_schema = [
        'batchnum', 'signame', 'time', 'ecg1', 'ecg2', 'ecg3'
    ]
    self.hr_schema = ['batchnum', 'signame', 'time', 'hr1', 'hr2', 'hr3']
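# A minimal sketch of the connectToDB method called above, assuming it opens a
# psycopg2 connection from the parsed postgres_config (host, database, port,
# user, password keys, as in the Airflow example later in this listing) and
# returns a cursor; the real implementation may differ.
import psycopg2

def connectToDB(self):
    conn = psycopg2.connect(host=self.postgres_config['host'],
                            database=self.postgres_config['database'],
                            port=self.postgres_config['port'],
                            user=self.postgres_config['user'],
                            password=self.postgres_config['password'])
    return conn.cursor()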
def __init__(self, subreddit, output_file_name=paths.OUTPUT_FILE):
    super().__init__()
    self.filter = helpers.Filter(watchlist=True)
    self.session = Session()
    self.reddit = self.init_connection()
    self.subreddit = subreddit
    self.output_file_name = output_file_name
    self.watchlist = helpers.parse_config("watchlist")
    self.watchlist_hits = []
    self.submissions = []
    self.subs_100_off = []
    self.subs_90_off = []
    self.subs_80_off = []
    self.subs_70_off = []
    self.subs_60_off = []
    self.subs_50_off = []
    self.subs_40_off = []
    self.subs_30_off = []
    self.subs_20_off = []
    self.subs_10_off = []
def __init__(self, kafka_configfile, schema_configfile, stream_configfile,
             psql_configfile, start_offset=0):
    """
    class constructor that initializes the instance according to the
    configurations of Kafka (brokers, topic, offsets), PostgreSQL database,
    data schema and batch interval for streaming
    :type kafka_configfile:  str     path to kafka config file
    :type schema_configfile: str     path to schema config file
    :type stream_configfile: str     path to stream config file
    :type psql_configfile:   str     path to psql config file
    :type start_offset:      int     offset from which to read from partitions of Kafka topic
    """
    SparkStreamerFromKafka.__init__(self, kafka_configfile, schema_configfile,
                                    stream_configfile, start_offset)
    self.psql_config = helpers.parse_config(psql_configfile)
    self.sqlContext = pyspark.sql.SQLContext(self.sc)
    self.load_batch_data()
    self.psql_n = 0
from flask import Flask, jsonify, request, send_file
import ast
import os
import sys

from pymongo import DESCENDING

if os.getcwd().split('/')[-1] != 'fl_sport_betting':
    os.chdir("..")
sys.path.append(os.path.abspath(os.curdir))

from storage.mongodb_storage import MongoDBStorage as mdb
import helpers

config = helpers.parse_config('server')
client = mdb().client
app = Flask(__name__)


def group_forecasts(forecasts):
    grouped = {}
    for forecast in forecasts:
        grouped.setdefault(forecast['forecast_type'], [])
        grouped[forecast['forecast_type']].append(forecast)
    return grouped


def get_file_link(resource, file_name):
    pattern = '{server_domain}/api/files/{resource}/{file_name}'
    file_link = pattern.format(server_domain=config['domain'],
                               resource=resource,
                               file_name=file_name)
    return file_link
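# Hypothetical usage of the two helpers above: group_forecasts buckets a list
# of forecast dicts by their 'forecast_type' key, and get_file_link builds a
# download URL from the parsed server config. The sample data is illustrative
# only and not taken from the original project.
sample_forecasts = [
    {'forecast_type': 'totals', 'value': 'over 2.5'},
    {'forecast_type': 'totals', 'value': 'under 3.5'},
    {'forecast_type': 'outcome', 'value': 'home win'},
]
grouped = group_forecasts(sample_forecasts)
# -> {'totals': [<2 forecasts>], 'outcome': [<1 forecast>]}
link = get_file_link('reports', 'weekly.pdf')
# -> '<config["domain"]>/api/files/reports/weekly.pdf'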
        topic=cfg['kafka_broker']['producer_topic_deals'],
        message=json):
    influxdb_client.write_deal()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    env_list = ['dev', 'sit', 'prd']
    parser.add_argument('-env',
                        choices=env_list,
                        metavar=str(env_list),
                        required=True,
                        help='parser environment type')
    # Parse command-line arguments
    args = vars(parser.parse_args())
    env_type = args['env']

    root_path = os.path.dirname(os.path.realpath(__file__))
    cfg = helpers.parse_config(env_type=env_type, root_path=root_path)
    logger = helpers.create_timed_rotating_log(cfg=cfg, root_path=root_path)

    start_time = datetime.combine(datetime.now().date(), time(4, 0, 0))
    if datetime.now() < start_time:
        logger.info('service will be starting at 04:00:00')
        t.sleep((start_time - datetime.now()).seconds)

    read_from_efx()
import os
import sys

sys.path.append('../python')

import boto3

import helpers

# TODO: Fix brokers-unavailable issue; now switched over to confluent-kafka.
session = 'k1'
kafka_config = helpers.parse_config('../../.config/kafka.config')
s3bucket_config = helpers.parse_config('../../.config/s3bucket.config')
ipaddr = kafka_config['ip-addr'].split(',')

s3 = boto3.client('s3')
obj = s3.get_object(Bucket="testsmalldata", Key="RECORDS_abridged.txt")
records = obj['Body'].read().decode('utf-8').split('\n')
records_per_node = int(round(len(records) / len(ipaddr)))
print(records_per_node)

# Open x number of file threads on y nodes. Visualized using tmux.
os.system('tmux kill-session -t %s' % session)
os.system('tmux new-session -s %s -n bash -d' % session)
for i in range(len(ipaddr)):
    start = i * records_per_node
    # the last node takes whatever records remain
    if i == len(ipaddr) - 1:
        stop = len(records) - 1
    else:
        stop = (i + 1) * records_per_node
    ip = ipaddr[i]
    records_interval = records[start:stop]
    os.system('echo %s' % ip)
import sys
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
import psycopg2
import pandas as pd
import boto3
from StringIO import StringIO

sys.path.append('../../python/')
import helpers

s3bucket_config_infile = '../../../.config/s3bucket.config'
postgres_config_infile = '../../../.config/postgres.config'
s3bucket_config = helpers.parse_config(s3bucket_config_infile)
postgres_config = helpers.parse_config(postgres_config_infile)
schema = ['id', 'batchnum', 'signame', 'time', 'ecg1', 'ecg2', 'ecg3']


def connectToDB(postgres_config):
    """
    :return: database cursor
    """
    try:
        conn = psycopg2.connect(host=postgres_config['host'],
                                database=postgres_config['database'],
                                port=postgres_config['port'],
                                user=postgres_config['user'],
                                password=postgres_config['password'])
sys.path.append("/home/ubuntu/TaxiOptimizer/helpers/") import os import time, json from app import app from datetime import datetime from flask import jsonify, render_template, request import helpers from math import floor from more_itertools import peekable import psycopg2 import random # configure connection string for PostgreSQL database app.dbconfig = helpers.parse_config('/home/ubuntu/TaxiOptimizer/config/postgresql.config') app.conn_str = "host='%s' dbname='%s' user='******' password='******'" % (app.dbconfig["host"], app.dbconfig["dbname"], app.dbconfig["user"], app.dbconfig["password"]) # set default vehicle_id and the list of coordinates to display app.vid = [] app.res = [] app.coords = [] # time will be in the range from 10am to 10pm app.curtime = 600 def print_time(t):