def run_ds_pipeline(config):
    """Run the data science pipeline stages enabled in *config*.

    The database interface is always initialised first. Each stage is then
    run, in this order, only if its flag under ``config['action']`` compares
    equal to ``True``:

    * ``['ingest']['make_list']``      -> ``ingest.ingest_data(config)``
    * ``['wrangle']['transform']``     -> ``wrangle.wrangle_data(config)``
    * ``['analyze']['analyze']``       -> ``analyze.analyze_data(config)``
    * ``['visualize']['visualize']``   -> ``visualize_and_analyze.visualize_data(config)``

    A stage whose flag is not set is skipped and the skip is logged.

    Args:
        config: nested mapping holding the ``action`` flags listed above; it
            is passed unchanged to every stage that runs.

    Raises:
        Whatever the flag lookups raise when an expected key is missing
        (``KeyError`` for a plain dict), plus anything raised by the stages
        themselves; nothing is caught here.
    """
    # Get the logger object; it has probably already been created elsewhere.
    logger = sa_logger.init(globals.PACKAGE_NAME)
    logger.info('Data science pipeline begining...')

    # Initialise the db before any stage runs.
    dbif.db_init()

    # NOTE: the explicit `== True` comparisons are kept on purpose. With plain
    # truthiness a non-boolean truthy value (e.g. a non-empty string) would
    # start enabling a stage, which the original code does not allow.
    if config['action']['ingest']['make_list'] == True:  # noqa: E712
        # Begin with data ingestion.
        ingest.ingest_data(config)
    else:
        # Reported through the logger like every other skipped stage (this
        # used to be a bare `print` statement, which bypassed the log and is
        # a syntax error on Python 3).
        logger.info('skipping ingestion...')

    if config['action']['wrangle']['transform'] == True:  # noqa: E712
        # Time for data wrangling.
        wrangle.wrangle_data(config)
    else:
        logger.info('skipping wrangling and insertion of data into db ...')

    if config['action']['analyze']['analyze'] == True:  # noqa: E712
        # Time for data analysis.
        analyze.analyze_data(config)
    else:
        logger.info('skipping analysis ...')

    if config['action']['visualize']['visualize'] == True:  # noqa: E712
        # Time for data visualization.
        visualize_and_analyze.visualize_data(config)
    else:
        logger.info('skipping visualization ...')
import va_utils import numpy as np import pandas as pd import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D from scipy import stats import matplotlib.pyplot as plt import statsmodels.api as sm from sklearn import linear_model #global varialbes for this file logger = sa_logger.init(globals.PACKAGE_NAME) OUTPUT_DIR_NAME = "output" def do_linear_regression(file_name): try: logger.info('-------------------------------------') logger.info('TCP Vs UDP linear regression.........') logger.info('-------------------------------------') df = pd.read_csv(file_name) #plot TCP Vs UDP, so UDP goes on x axis x = np.array(df['UDP']) X = x[:, np.newaxis] y = np.array(df['TCP']) #create a linear regressor