Example No. 1
def to_redshift(data_frame):
    #date_str =data_base.strftime('%Y/%m/%d')
    print('connect_to_redshift')
    pr.connect_to_redshift(
        dbname='pricing',
        host='pricing.cfefnwtyvvt2.us-east-1.rds.amazonaws.com',
        port='5432',
        user='******',
        password='******')
    print('connect_to_s3')
    pr.connect_to_s3(
        aws_access_key_id="AKIAILQVO2DJQHRLFLQQ",
        aws_secret_access_key="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",
        bucket="kroton-analytics",
        subdirectory="raw/uploads/esteira")
    print('pandas_to_redshift')
    pr.pandas_to_redshift(
        data_frame=data_frame,
        ##columns_data_types = ['real_mes' float64, 'real_ano_anterior' float64, 'acumulado_ano' float64, 'meta_ano' float64],
        ##column_data_types=['VARCHAR(250)','DATE','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','VARCHAR(250)','FLOAT','FLOAT','FLOAT','FLOAT'],
        index=False,
        ##column_data_types = None,
        redshift_table_name='kroton_pricing.tb_dev_bd_ccr',
        append=True)
    return print('fim save_to_redshift')
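None of the upload code in Example No. 1 closes the session it opens; pandas_redshift provides close_up_shop() for that, and several of the read examples below call it. A minimal sketch of a wrapper around the function above (the wrapper name is invented):

import pandas_redshift as pr

def to_redshift_safely(data_frame):
    # hypothetical wrapper around to_redshift() from Example No. 1
    try:
        to_redshift(data_frame)
    finally:
        pr.close_up_shop()  # close the Redshift connection even if the upload fails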
Example No. 2
def to_redshift():
    pr.connect_to_redshift(
        dbname='pricing',
        host='pricing.cfefnwtyvvt2.us-east-1.rds.amazonaws.com',
        port='5432',
        user='******',
        password='******')
    print('Connect to RedShift')
    return pr.redshift_to_pandas('select * from kroton_pricing.bd_ccr')
Example No. 3
def rs_data_select(query):
    pr.connect_to_redshift(dbname=DBNAME,
                           host=HOST,
                           port=PORT,
                           user=RS_ID,
                           password=RS_PW)
    df = pr.redshift_to_pandas(query)
    pr.close_up_shop()
    df = df.round(2)
    return df
Example No. 4
def connect_to_redshift():
    with open('credentials.json') as json_data:
        credentials = json.load(json_data)

    #Connect to Redshift
    pr.connect_to_redshift(dbname = 'muni',
                        host = 'jonobate.c9xvjgh0xspr.us-east-1.redshift.amazonaws.com',
                        port = '5439',
                        user = credentials['user'],
                        password = credentials['password'])
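Examples No. 4, 5, 8, 9 and 11 read the user and password from a credentials.json file next to the script. A minimal file with just the two keys the code accesses might look like this (placeholder values only):

{
    "user": "my_redshift_user",
    "password": "my_redshift_password"
}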
Example No. 5
def connect_to_redshift():
    with open('credentials.json') as json_data:
        credentials = json.load(json_data)

    #Connect to Redshift
    pr.connect_to_redshift(
        dbname='act',
        host='ac-transit.cupreqhyyagj.us-west-1.redshift.amazonaws.com',
        port='5439',
        user=credentials['user'],
        password=credentials['password'])
Example No. 6
    def connect_aws(self):
        """ Connect to AWS using pandas_redshift """
        pr.connect_to_s3(aws_access_key_id = s3_credentials_AWS_ACCESS_KEY,
                         aws_secret_access_key = s3_credentials_AWS_SECRET_KEY,
                         bucket = s3_credentials_BUCKET,
                         subdirectory = BUCKET_FOLDER)

        pr.connect_to_redshift(dbname    = redshift_credentials_dbname,
                                host     = redshift_credentials_host,
                                port     = redshift_credentials_port,
                                user     = redshift_credentials_user,
                                password = redshift_credentials_password)
Example No. 7
def db_pandas_query(query):
    """
    Read Redshift table into a pandas data frame
    """
    pr.connect_to_redshift(dbname=DB_NAME,
                           host=DB_HOST,
                           port=DB_PORT,
                           user=DB_USER,
                           password=DB_PASSWORD)
    data = pr.redshift_to_pandas(query)
    pr.close_up_shop()
    return data
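A short usage sketch for db_pandas_query; the schema and table in the query are made up, and the DB_* constants must already be defined as above:

sales = db_pandas_query("SELECT * FROM some_schema.some_table LIMIT 100")
print(sales.shape)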
Example No. 8
def get_distributions():
    with open('credentials.json') as json_data:
        credentials = json.load(json_data)

    pr.connect_to_redshift(dbname = 'muni',
                        host = 'jonobate.c9xvjgh0xspr.us-east-1.redshift.amazonaws.com',
                        port = '5439',
                        user = credentials['user'],
                        password = credentials['password'])

    df = pr.redshift_to_pandas("""select *,
                                    convert_timezone('US/Pacific', departure_time_hour) as local_departure_time_hour
                                     from distributions_gamma""")
    pr.close_up_shop()

    return df
Example No. 9
def get_raw(sample_flag):
    with open('credentials.json') as json_data:
        credentials = json.load(json_data)

    pr.connect_to_redshift(dbname = 'muni',
                        host = 'jonobate.c9xvjgh0xspr.us-east-1.redshift.amazonaws.com',
                        port = '5439',
                        user = credentials['user'],
                        password = credentials['password'])

    if sample_flag:
        df = pr.redshift_to_pandas("""select * from vehicle_monitoring limit 1000""")
        df.to_csv('data/vehicle_monitoring_sample.csv', index=False)
    else:
        df = pr.redshift_to_pandas("""select * from vehicle_monitoring""")
        df.to_csv('data/vehicle_monitoring.csv', index=False)
    pr.close_up_shop()
    return df
Example No. 10
def dbconnect(df):
    dbname = os.getenv('REDSHIFT_DB')
    host = os.getenv('REDSHIFT_HOST')
    port = os.getenv('REDSHIFT_PORT')
    user = os.getenv('REDSHIFT_USER')
    password = os.getenv('REDSHIFT_PASS')

    pr.connect_to_redshift(dbname=dbname,
                           host=host,
                           port=port,
                           user=user,
                           password=password)

    pr.connect_to_s3(aws_access_key_id=os.getenv('ACCESS_KEY_ID'),
                     aws_secret_access_key=os.getenv('SECRET_ACCESS_KEY'),
                     bucket='TODO')
    pr.pandas_to_redshift(data_frame=df,
                          redshift_table_name='weather_data',
                          append=True)
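A hedged usage sketch for dbconnect(); the sample frame and its columns are invented, and the 'TODO' bucket placeholder above still has to be filled in before the upload can succeed:

import pandas as pd
import pandas_redshift as pr

# hypothetical sample rows for the weather_data table
sample = pd.DataFrame({'city': ['Austin'], 'temp_c': [31.5]})
dbconnect(sample)
pr.close_up_shop()  # release the connection opened inside dbconnect()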
Example No. 11
def get_distributions(sample_flag):
    with open('credentials.json') as json_data:
        credentials = json.load(json_data)

    pr.connect_to_redshift(
        dbname='muni',
        host='jonobate.c9xvjgh0xspr.us-east-1.redshift.amazonaws.com',
        port='5439',
        user=credentials['user'],
        password=credentials['password'])

    if sample_flag:
        df = pr.redshift_to_pandas(
            """select departure_time_hour, departure_stop_id, arrival_stop_id, shape, scale, shape*scale as mean
                                        from distributions_gamma limit 1000""")
        df.to_csv('data/distributions_gamma_sample.csv', index=False)
    else:
        df = pr.redshift_to_pandas(
            """select departure_time_hour, departure_stop_id, arrival_stop_id, shape, scale, shape*scale as mean
                                        from distributions_gamma""")
        df.to_csv('data/distributions_gamma.csv', index=False)
    pr.close_up_shop()
    return df
Example No. 12
from flask import Flask, jsonify, json
import pandas as pd
import pandas_redshift as pr

pr.connect_to_redshift(dbname = 'habladb',
                        host = 'habla-ai.csvoexx0fghm.us-west-2.redshift.amazonaws.com',
                        port = 5439,
                        user = '******',
                        password = '******')


# MEAN UPTIME AND DOWNTIME ACROSS MULTIPLE PLANTS
LAMB_WESTON_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.lbw_mean_uptime t')
LAMB_WESTON_DOWNTIME = pr.redshift_to_pandas('SELECT t.* FROM public.lbw_mean_downtime t')


# PASCO UPTIME BY DAY BY LINE
PASCO_L1_S6_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s6_up_groupby t')
PASCO_L1_S7_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s7_up_groupby t')
PASCO_L1_S8_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s8_up_groupby t')
PASCO_L1_S9_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s9_up_groupby t')
PASCO_L1_S10_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s10_up_groupby t')
PASCO_L2_S1_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l2_s1_up t')
PASCO_L2_S2_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l2_s2_up t')
PASCO_L2_S3_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l2_s3_up t')
PASCO_L2_S4_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l2_s4_up t')
PASCO_L2_S5_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l2_s5_up t')

# PASCO REASON LEVEL 1
PASCO_L1_S6_RLVL1 = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_rlv1_l1s6 t')
PASCO_L1_S7_RLVL1 = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_rlv1_l1s7 t')
Example No. 13
# Databricks notebook source
import pandas_redshift as pr
import pandas as pd

# COMMAND ----------

pr.connect_to_redshift(dbname = 'pso',
                        host = '',
                        port =5439,
                        user = '',
                        password = '')

# COMMAND ----------



# COMMAND ----------

 sql="""copy {}.{} from '{}'\
        credentials \
        'aws_access_key_id={};aws_secret_access_key={}' \
         timeformat 'YYYY-MM-DD HH:MI:SS'
        DELIMITER '\t' lzop;commit;"""\
        .format('public', 'mm_impressions_sh_test', 's3://mm-prod-platform-impressions/data/organization_id=100977/impression_date=2018-07-25/', '{{access_key}}', '{{secret}}')

# COMMAND ----------

from sqlalchemy import create_engine
conn = create_engine('postgresql://*****:*****@host:5439/pso')

try:
Example No. 14
                                   path_file=obj_name)
    redshift_tables.append(last)
    print(
        str(
            repos.compile(dialect=RedshiftDialect(),
                          compile_kwargs={'literal_binds': True})))
    print(dir(repos))

    connection = engine.connect()
    connection.execute(repos.execution_options(autocommit=True))
    connection.close()
    return file_json

pr_redshift = pr.connect_to_redshift(
    dbname='dev',
    host='redshift-cluster-1.cug5ajtfsvsw.us-west-2.redshift.amazonaws.com',
    port=5439,
    user='******',
    password='******')

for table in redshift_tables:
    data = pr.redshift_to_pandas('select * from {}'.format(table))
    data_ = data['data']


def get_modules_and_for_position(file):
    imports = []

    #Get all imported Modules

    result = re.findall(r"(?<!from)import (\w+)[\n.]|from\s+(\w+)\s+import",
                        file)
Example No. 15
##difference
#for i in tqdm(reversed(range(len(li)))):
#    if i != 0:
#        li[i][9] = li[i][9] - li[i-1][9]
#        li[i][10] = li[i][10] - li[i-1][10]

df3 = pd.concat(li) #concat li. df3 is final dataframe
df3 = df3[[0,1,2,3,4,5,6,9,10,11,12,13,14,15,16,17]]
df3.columns = ['date_ymdh', 'ten_cd', 'sku_cd', 'dpt_cd', 'line_cd', 'class_cd', 'sku_name', 'urisu', 'urikin', 'gsagsu1', 'gsaggk1', 'gsagsu2', 'gsaggk2', 'gsagsu3', 'gsaggk3', 'garari']

dbname = os.getenv('REDSHIFT_DB')
host = os.getenv('REDSHIFT_HOST')
port = os.getenv('REDSHIFT_PORT')
user = os.getenv('REDSHIFT_USER')
password = os.getenv('REDSHIFT_PASS')

pr.connect_to_redshift(dbname = dbname,
                        host = host,
                        port = port,
                        user = user,
                        password = password)

pr.connect_to_s3(aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID'),
                aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY'),
                bucket = 'nichiji-tmp'
                )

pr.pandas_to_redshift(data_frame = df3,
                        redshift_table_name = 'jisseki_nichiji',
                        append = True)
Example No. 16
## basic params
local_file = mongo_file
bucket_name = 'worthy-etl'
s3_file_name = 'mirros/mongo_decision.csv'

#Obtaining the connection to RedShift
con = psycopg2.connect(
    dbname='dwh_dev',
    host='redshift-cluster-1.coczxpfcmk83.us-east-1.redshift.amazonaws.com',
    port='5439',
    user=username_red,
    password=password_red)
con_red = pr.connect_to_redshift(
    dbname='dwh_dev',
    host='redshift-cluster-1.coczxpfcmk83.us-east-1.redshift.amazonaws.com',
    port='5439',
    user=username_red,
    password=password_red)

# read csv file and build table in redshift according to its structure
## open file get header and make create table statement
f = open(mongo_file, 'r')
reader = csv.reader(f, delimiter=';')
longest, headers, type_list = [], [], []

table_name = 'mrr_mongo_decisions_tag'
statement_drop = 'drop table ' + table_name
statement_create = 'create table ' + table_name + ' ('


## update log table start
Example No. 17
import pandas as pd
import pandas_redshift as pr

# Connect to Redshift
### This code calculates the number of trips for each route, processing one file at a time, and also finds the most-used routes. Citi Bike data is public.
pr.connect_to_redshift(dbname = 'production',
                        host ='localhost',
                        port ='5439',
                        user ='******',
                        password ='******')

# Connect to S3
pr.connect_to_s3(aws_access_key_id = 'sadadasfaftew',
                aws_secret_access_key = 'ewwet4tsdvsrvrvrervwef',
                bucket = 'data-science',
                subdirectory = 'shwangdir'
                )

#upload a copy to S3 and redshift

#for i in range(5):
    #url = 'https://s3.amazonaws.com/tripdata/20180{}-citibike-tripdata.csv.zip'.format(i+1)
df = pd.read_csv('/Users/ankitkumar/Downloads/201801-citibike-tripdata.csv')
print(df)
pr.pandas_to_redshift(data_frame = df,
                      redshift_table_name = 'analytics.trip_fact')

dfroutes = (df.groupby(['start station id', 'end station id']).size()
            .sort_values(ascending=False)
            .reset_index(name='count'))
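If the aggregated route counts were to be written back to Redshift, the same pandas_to_redshift call used earlier in this example would work; the target table name here is an assumption:

# hypothetical follow-up: persist the route counts and close the session
pr.pandas_to_redshift(data_frame=dfroutes,
                      redshift_table_name='analytics.route_counts',  # assumed table name
                      append=False)
pr.close_up_shop()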
Example No. 18
 sku_cd    | character varying(256) |           |          |
 dpt_cd    | character varying(256) |           |          |
 line_cd   | character varying(256) |           |          |
 class_cd  | character varying(256) |           |          |
 sku_name  | character varying(256) |           |          |
 urisu     | numeric(10,2)          |           |          |
 urikin    | numeric(10,2)          |           |          |
 gsagsu1   | numeric(10,2)          |           |          |
 gsaggk1   | numeric(10,2)          |           |          |
 gsagsu2   | numeric(10,2)          |           |          |
 gsaggk2   | numeric(10,2)          |           |          |
 gsagsu3   | numeric(10,2)          |           |          |
 gsaggk3   | numeric(10,2)          |           |          |
 garari    | numeric(10,2)          |           |          |
"""

pr.connect_to_redshift(dbname=os.getenv('REDSHIFT_DB'),
                       host=os.getenv('REDSHIFT_HOST'),
                       port=os.getenv('REDSHIFT_PORT'),
                       user=os.getenv('REDSHIFT_USER'),
                       password=os.getenv('REDSHIFT_PASS'))

pr.connect_to_s3(aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
                 aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
                 bucket='jisseki-nichiji')

data = {"date_ymdh": ["2018063012"], "ten_cd": ["0001"]}
f = pd.DataFrame(data)

pr.pandas_to_redshift(f, 'jisseki_nichiji')
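pandas_to_redshift also accepts explicit column types through column_data_types (the parameter appears, commented out, in Example No. 1). A hedged variant of the call above; the two VARCHAR types are assumptions for date_ymdh and ten_cd:

pr.pandas_to_redshift(data_frame=f,
                      redshift_table_name='jisseki_nichiji',
                      column_data_types=['VARCHAR(256)', 'VARCHAR(256)'])  # assumed types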
Example No. 19
FILE_NAME = BUCKET_FOLDER + "/" + REDSHIFT_TABLE_NAME + "_" + DATE_NOW + ".csv"
s3 = session.resource('s3')
bucket = s3.Bucket(s3_credentials_BUCKET)
s3.Object(s3_credentials_BUCKET, FILE_NAME).put(Body=csv_buffer.getvalue())

print('3. Connect to S3')
pr.connect_to_s3(aws_access_key_id=s3_credentials_AWS_ACCESS_KEY,
                 aws_secret_access_key=s3_credentials_AWS_SECRET_KEY,
                 bucket=s3_credentials_BUCKET,
                 subdirectory=BUCKET_FOLDER)

print('4. Connect to Redshift')
pr.connect_to_redshift(dbname=redshift_credentials_dbname,
                       host=redshift_credentials_host,
                       port=redshift_credentials_port,
                       user=redshift_credentials_user,
                       password=redshift_credentials_password)


def delete_from_date(date):
    """ Delete data from table """
    query = "DELETE FROM {table} WHERE date_sent >= '{datef}'".format(
        table=REDSHIFT_TABLE_NAME, datef=date)
    print("PRINT SQL STATEMENT: ", query)
    pr.exec_commit(query)
    return None


delete_from_date(date='2018-01-01')
Example No. 20
def cleansing_format_data(date):
    colss_li = [i for i in range(0, 211)]
    del colss_li[11:21]
    colss_li.remove(2)
    colss_li.remove(7)

    response = s3client.list_objects(Bucket='ld-rawdata',
                                     Prefix='TR_JISSEKI/' + date + 'XXXXXX/')

    if 'Contents' in response:
        keys = [content['Key'] for content in response['Contents']]
        key = keys[-1]  # data for 23:00

    bucket_name = 'ld-rawdata'
    file_name = key
    day = file_name[37:45]  #day string
    reader = pd.read_csv('s3n://' + bucket_name + '/' + file_name,
                         encoding="cp932",
                         header=None,
                         iterator=True,
                         chunksize=1000,
                         usecols=colss_li)
    df = pd.concat((r for r in reader), ignore_index=True)

    li = []
    df = df[df[0].isin([day])]
    hour = 7
    base = df.loc[:, 0:10]

    #Make hours list
    for i in range(19):
        if hour < 24:
            base.loc[:, 0] = pd.datetime(int(day[0:4]), int(day[4:6]),
                                         int(day[6:8]), hour)
        elif hour > 23:
            base.loc[:, 0] = pd.datetime(int(day[0:4]), int(
                day[4:6]), int(day[6:8]), hour - 24) + timedelta(days=1)

        hour += 1
        li.append(pd.concat([base, df.loc[:, 21 + i * 10:30 + i * 10]],
                            axis=1))

    #set columns
    for i in range(len(li)):
        li[i].columns = [j for j in range(19)]

    df3 = pd.concat(li)  #concat li. df3 is final dataframe
    df3 = df3[[0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17]]
    df3.columns = [
        'date_ymdh', 'ten_cd', 'sku_cd', 'dpt_cd', 'line_cd', 'class_cd',
        'sku_name', 'urisu', 'urikin', 'gsagsu1', 'gsaggk1', 'gsagsu2',
        'gsaggk2', 'gsagsu3', 'gsaggk3', 'garari'
    ]

    dbname = os.getenv('REDSHIFT_DB')
    host = os.getenv('REDSHIFT_HOST')
    port = os.getenv('REDSHIFT_PORT')
    user = os.getenv('REDSHIFT_USER')
    password = os.getenv('REDSHIFT_PASS')

    pr.connect_to_redshift(dbname=dbname,
                           host=host,
                           port=port,
                           user=user,
                           password=password)

    pr.connect_to_s3(aws_access_key_id=os.getenv('ACCESS_KEY_ID'),
                     aws_secret_access_key=os.getenv('SECRET_ACCESS_KEY'),
                     bucket='nichiji-tmp')

    pr.pandas_to_redshift(data_frame=df3,
                          redshift_table_name='jisseki_nichiji',
                          append=True)
Example No. 21
def business_loss():
    images = [
        join("customer_lifetime_value/", f)
        for f in listdir("./static/customer_lifetime_value")
    ]

    db_name = "info7374dbassignment2"  #-------------------------------------Redshift: Database Name for gaming data

    master_username = "******"  #----------------------------------------Redshift: Admin Username
    master_password = "******"  #---------------------------------Redshift: Admin Password

    hostname = "info7374clusterproject.cwtvmzfhaqaf.us-east-1.redshift.amazonaws.com"  #----------------Redshift: Hostname for database
    port_number = 5439  #----------------Redshift: Port Number for database

    pr.connect_to_redshift(dbname=db_name,
                           host=hostname,
                           port=port_number,
                           user=master_username,
                           password=master_password)

    data = pr.redshift_to_pandas('select * from sales')

    data = data.drop_duplicates()

    data = data[pd.notnull(data['customerid'])]

    data = data[(data['quantity'] > 0)]

    #most bought product
    data['description'].value_counts()[:10]

    #which customer bought the most number of items?
    cust_data = pd.DataFrame()
    cust_data['customerid'] = list(set(data['customerid']))
    cust_data = cust_data.set_index('customerid')
    for cust_id in cust_data.index:
        cust_data.at[cust_id, 'Number_of_items'] = (len(
            data[data['customerid'] == cust_id]['description']))
    cust_data = cust_data.sort_values('Number_of_items', ascending=False)

    #	stockcode	description					states
    data = data[[
        'customerid', 'invoicedate', 'invoiceno', 'quantity', 'unitprice'
    ]]
    #Calculate total purchase
    data['TotalPurchase'] = data['quantity'] * data['unitprice']

    data_group = data.groupby('customerid').agg({
        'invoicedate':
        lambda date: (date.max() - date.min()).days,
        'invoiceno':
        lambda num: len(num),
        'quantity':
        lambda quant: quant.sum(),
        'TotalPurchase':
        lambda price: price.sum()
    })

    # Change the name of columns
    data_group.columns = [
        'num_days', 'num_transactions', 'num_units', 'spent_money'
    ]
    data_group.head()

    # Average Order Value
    data_group['avg_order_value'] = data_group['spent_money'] / data_group[
        'num_transactions']

    purchase_frequency = sum(
        data_group['num_transactions']) / data_group.shape[0]

    # Repeat Rate
    repeat_rate = data_group[
        data_group.num_transactions > 1].shape[0] / data_group.shape[0]
    #Churn Rate
    churn_rate = 1 - repeat_rate

    purchase_frequency, repeat_rate, churn_rate

    # Profit Margin
    data_group['profit_margin'] = data_group['spent_money'].astype(
        'float') * 0.05

    # Customer Value
    data_group['CLV'] = (data_group['avg_order_value'].astype('float') *
                         purchase_frequency) / churn_rate
    #Customer Lifetime Value
    data_group['cust_lifetime_value'] = data_group['CLV'].astype(
        'float') * data_group['profit_margin'].astype('float')
    data_group.head()

    clv = data_group.loc[:, "cust_lifetime_value"].mean() / 1000000

    # drop the row missing customerID
    data = data[data.customerid.notnull()]

    # extract year, month and day
    data['invoiceday'] = data.invoicedate.apply(
        lambda x: dt.datetime(x.year, x.month, x.day))
    data.head()

    monthly_unique_customers_df = data.set_index(
        'invoiceday')['customerid'].resample('M').nunique()

    pd.DataFrame(monthly_unique_customers_df)['invoicedate'] = pd.DataFrame(
        monthly_unique_customers_df).index

    df = pd.DataFrame(monthly_unique_customers_df).reset_index()

    Customer_count = df.loc[:, "customerid"].mean()

    df["CustomerIDshift"] = [0] + list(df["customerid"][:-1])

    df["ChurnRate"] = (df["CustomerIDshift"] -
                       df["customerid"]) / df["CustomerIDshift"]

    df.rename(columns={'invoiceday': 'Month'}, inplace=True)

    df['ChurnRate'][0] = 1

    data = df.drop(columns=['customerid', 'CustomerIDshift'])

    table1 = data

    table1

    from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
    from statsmodels.tsa.arima_model import ARIMA

    data = data.set_index('Month')
    data.index

    model = ARIMA(data, order=(2, 1, 0))
    model_fit = model.fit(disp=0)
    print(model_fit.summary())
    # plot residual errors
    # residuals = pd.DataFrame(model_fit.resid)
    # residuals.plot()
    # plt.show()
    # residuals.plot(kind='kde')
    # plt.show()

    X = data.values
    history = [x for x in X]

    test = [
        '2019-01-31', '2019-02-28', '2019-03-31', '2019-04-30', '2019-05-31',
        '2019-06-30'
    ]
    predictions = list()
    for t in range(len(test)):
        model = ARIMA(history, order=(2, 1, 0))
        model_fit = model.fit(disp=0)
        output = model_fit.forecast()
        yhat = output[0]
        history.append(yhat)
        predictions.append(yhat)
        print('predicted=%f' % (yhat))

    print(predictions)

    i = 0
    yes_array = []
    for value in predictions:
        print(predictions[i])
        yes_array.append(predictions[i])
        i += 1

    df_toplot = pd.DataFrame({"ChurnRate": yes_array, "Month": test})

    df_toplot["Business_Loss"] = df["ChurnRate"] * clv * Customer_count

    x = df_toplot["Business_Loss"].astype(int)

    df_toplot['Business Loss'] = x

    final_df = df_toplot

    del final_df['Business_Loss']

    table2 = final_df
    table1 = table1.to_html(classes="data")
    table2 = table2.to_html(classes="data")

    return render_template('business_loss.html',
                           tables=[table1, table2],
                           titles=["Blah", "Churn Rate", "Future Churn Rate"],
                           images=images,
                           clv=clv)
Example No. 22
import numpy as np
import datetime as dt
from statsmodels.tsa.arima_model import ARIMA

db_name = "info7374dbassignment2"#-------------------------------------Redshift: Database Name for gaming data

master_username = "******"#----------------------------------------Redshift: Admin Username
master_password = "******"#---------------------------------Redshift: Admin Password


hostname = "info7374clusterproject.cwtvmzfhaqaf.us-east-1.redshift.amazonaws.com" #----------------Redshift: Hostname for database
port_number = 5439    #----------------Redshift: Port Number for database

pr.connect_to_redshift(dbname = db_name ,
                        host = hostname,
                        port = port_number,
                        user = master_username,
                        password =master_password)

online = pr.redshift_to_pandas('select * from sales')

online.head(5)

# drop the row missing customerID
online = online[online.customerid.notnull()]

# extract year, month and day 
online['invoiceday'] = online.invoicedate.apply(lambda x: dt.datetime(x.year, x.month, x.day))
online.head()

monthly_unique_customers_df = online.set_index('invoiceday')['customerid'].resample('M').nunique()
Example No. 23
str_user = secrets.str_user
str_pw = secrets.str_pw

# s3 configurations
str_accesskeyid = secrets.str_accesskeyid
str_secretaccesskey = secrets.str_secretaccesskey
str_s3bucket = secrets.str_s3bucket
str_s3subdirectory = secrets.str_s3subdirectory

# drop the reference to the secrets module
del secrets

# create pandas-redshift connection
pr.connect_to_redshift(dbname=str_dbname,
                       host=str_host,
                       port=str_port,
                       user=str_user,
                       password=str_pw)

# create dataframe from redshift query
sql_query = "SELECT * FROM <database>.<schema>.<table>;"
df = pr.redshift_to_pandas(sql_query)

print("Shape of dataframe: ", df.shape)

# create sample dataframe for upload
df_upload = pd.DataFrame({
    'a_col': ['red', 'green', 'blue'],
    'b_col': [1, 2, 3],
    'c_col': [True, False, True],
    'd_col': ['2020-01-01', '2020-02-04', '2020-03-06'],
Example No. 24
import pandas_redshift as pr

pr.connect_to_redshift(
    dbname="dev",
    host=
    'redshift-cluster-1.cajhj66uu5bu.ap-northeast-1.redshift.amazonaws.com',
    port='5439',
    user='******',
    password='******')

df = pr.redshift_to_pandas('select * from test')
Example No. 25
    def conectar_redshift(self):
        pr.connect_to_redshift(dbname=self.dbname,
                               host=self.host,
                               user=self.user,
                               port=5439,
                               password=self.password)
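The method above only opens the connection; a hypothetical companion method that runs a query and then closes the session could look like this (name and behaviour are assumptions, not part of the original class):

    def run_query(self, query):
        # hypothetical helper: assumes conectar_redshift() has already been called
        df = pr.redshift_to_pandas(query)
        pr.close_up_shop()
        return df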