Example #1
def get_events_data():
    """Get well events - takes ~30s"""

    f_start = time.time()

    # read in event data & fix column names
    events = pd.read_excel(xlsx, sheet_name='INTERVENTIONS_OFM')
    events.columns = ['UNIQUEID', 'Date', 'Year', 'Month', 'Day', 'Event']

    # fix dates
    events['Date'] = events['Date'].dt.to_period('M').dt.to_timestamp('M')
    events.drop(['Year', 'Day', 'Month'], axis=1, inplace=True)

    # filter to shorter list of events
    keep_events = [
        'Chemical Stimulation', 'Cleaning', 'COMPLETION', 'Pump Replacement',
        'Reactivation', 'Rod Change', 'SRP Installation', 'Workover',
        'zone isolated with arena'
    ]
    events = events.loc[events['Event'].isin(keep_events)]

    # one-hot encode the remaining event types
    events['Event'] = events.Event.str.replace(' ', '_')
    events = pd.get_dummies(events, columns=['Event'])

    # merge multiple events per month into single row
    events = events.groupby(['UNIQUEID', 'Date']).sum()
    events = events.applymap(lambda x: int(bool(x)))
    events.reset_index(inplace=True)

    utils.print_log('events prepped: ' +
                    utils.secs_to_hms(time.time() - f_start))

    return events
Example #2
def get_general_data():
    """Get data from general tab - takes ~25 seconds"""

    f_start = time.time()

    # read in data
    gen = pd.read_excel(xlsx, sheet_name='GENERAL')

    # some cleanup
    gen.drop(['WELLBORE', 'Completation_Date'], axis=1, inplace=True)
    gen['PRODUCER'] = gen['PRODUCER'].replace({'Y': 1, 'N': 0})

    # drop columns identified by slb as unnecessary
    gen.drop(
        ['SUB_STATE', 'COMPLETION_PACKING', 'COLLAPSED', 'Completation_Type'],
        axis=1,
        inplace=True)

    # clean & dummy categorical columns
    gen['RESERVOIR'] = gen['RESERVOIR'].replace(
        np.nan, 'RESERVOIR(DNE)').apply(
            lambda x: re.search(r'.*\((.*)\)', x).group(1)).replace(
                'DNE', np.nan)
    # gen = pd.get_dummies( gen, columns=[ 'RESERVOIR', 'WELLTYPE', 'SUB_STATE', 'COMPLETION_PACKING', 'COLLAPSED', 'Completation_Type' ] )
    gen = pd.get_dummies(gen, columns=['RESERVOIR', 'WELLTYPE'])

    utils.print_log('general prepped: ' +
                    utils.secs_to_hms(time.time() - f_start))

    return gen
Example #3
def get_production_data():
    """Get monthly production data - takes ~2min"""

    f_start = time.time()

    # read in production data
    prod_df = pd.DataFrame()
    for sheet in [
            'Monthly_PROD1_OFM', 'Monthly_PROD2_OFM', 'Monthly_PROD3_OFM',
            'Monthly_PROD4_OFM'
    ]:
        sheet_df = pd.read_excel(xlsx, sheet_name=sheet)
        prod_df = pd.concat([prod_df, sheet_df], ignore_index=True)

    # drop missing values
    start = time.time()
    prod_df.dropna(
        how='all', inplace=True
    )  # 2208 rows in PROD2 and 617 rows in PROD3 are completely blank
    prod_df.dropna(
        subset=['OIL [bbl]', 'GAS [Mcf]', 'WATER [bbl]'],
        how='all',
        inplace=True
    )  # 12 rows in PROD2, 9 rows in PROD3, and 10 rows in PROD4 have no prod values for rows with an assigned UNIQUEID & date
    utils.print_log('drop missing: ' + utils.secs_to_hms(time.time() - start))

    # resample to ensure monthly increments
    start = time.time()
    prod_df = prod_df.set_index('Date').groupby('UNIQUEID').resample(
        '1M').sum().reset_index()
    prod_df.loc[~prod_df['DAYS [days]'].isnull(), 'Well_Open'] = 1
    prod_df['Well_Open'].fillna(0, inplace=True)
    prod_df['OIL [bbl]'].fillna(0, inplace=True)
    prod_df['GAS [Mcf]'].fillna(0, inplace=True)
    prod_df['WATER [bbl]'].fillna(0, inplace=True)
    prod_df.drop('DAYS [days]', axis=1, inplace=True)
    utils.print_log('resample: ' + utils.secs_to_hms(time.time() - start))

    # add counter for months of production history
    start = time.time()
    prod_df['Well_Age'] = prod_df.groupby('UNIQUEID').cumcount() + 1
    utils.print_log('age counter: ' + utils.secs_to_hms(time.time() - start))

    utils.print_log('production prepped: ' +
                    utils.secs_to_hms(time.time() - f_start))
    return prod_df
Example #4
def get_xycdate_data():
    """Get XY coordinates, elevation, depth, and glevel - takes ~25 seconds"""

    f_start = time.time()

    # read in data
    xyc = pd.read_excel(xlsx, sheet_name='XY-CDATE')

    # some cleanup
    xyc.drop(['CDATE', 'ALIAS', 'WELLBORE'], axis=1, inplace=True)

    # interpolate missing values
    for col in ['KBELEVATION [ft]', 'TOTALDEPTH [ft]', 'GLEVEL']:
        interpolate(col, xyc)

    utils.print_log('xyc prepped: ' + utils.secs_to_hms(time.time() - f_start))

    return xyc
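
The interpolate() helper called in the loop above is not shown in this example. A minimal sketch of what it could look like, assuming it fills gaps in a single column in place using pandas' built-in interpolation (the name and argument order come from the call site; the fill strategy is an assumption):

def interpolate(col, df):
    # Hypothetical helper (not part of the original example): fill missing
    # values in one column by linear interpolation, then back/forward fill
    # any leading or trailing NaNs that interpolation cannot reach.
    df[col] = df[col].interpolate().bfill().ffill()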
Example #5
def get_warter_data():
    """Get warter data"""

    f_start = time.time()

    # read in data
    warter = pd.read_excel(xlsx, sheet_name='Monthly_Warter_INJ_OFM')

    # some cleanup
    warter.drop(['DAYS [days]'], axis=1, inplace=True)

    # resample to ensure monthly increments, fill nans with 0
    warter = warter.set_index('Date').groupby('UNIQUEID').resample('M').mean()
    warter['WINJ [bbl]'] = warter['WINJ [bbl]'].fillna(0)
    warter['PINJ [psig]'] = warter['PINJ [psig]'].fillna(0)
    warter.reset_index(inplace=True)

    utils.print_log('warter prepped: ' +
                    utils.secs_to_hms(time.time() - f_start))

    return warter
Example #6
    def write_summary(self, popn, treatmt, tot_cycles, elapsed_time):
        """
        Write simulation summary to file(s).

        Write summary data about this simulation run
        to two summary files, in CSV format:

        'testgroup_results.csv' (master summary file for this test group)
        'testgroup_paramset_results.csv' (summary for this param set only)

        Args
        ----
        popn : a tumour population
        treatmt : a treatment regime
        tot_cycles : total number of cycles for this simulation
        elapsed_time : simulation runtime in seconds

        Returns
        -------
        None
        """
        # Get pre-crash min and max population size, and
        # the times at which they occurred
        precrash_minmax_data = analytics.precrash_minmax(treatmt, popn)
        min_val, min_time, max_val, max_time = precrash_minmax_data

        cmin_val = cmin_time = cmax_val = cmax_time = 0

        went_through_crash = 'N'
        if analytics.went_through_crash(treatmt, popn):
            went_through_crash = 'Y'
            # Get post-crash min and max values
            postcrash_minmax_data = analytics.postcrash_minmax(treatmt, popn)
            cmin_val, cmin_time, cmax_val, cmax_time = postcrash_minmax_data
            # hasty fix for calculating max time
            # if cmax_time == 0 and recovered:
            #    cmax_time = tot_cycles

        # determine whether, when and how fully the population recovered
        recovery_status = analytics.completion_status(self, treatmt, popn)
        recovered, recover_type, recover_percent = recovery_status

        # open files (these should have been created already)
        tg_summary_fpath = "{0}/{1}_results.csv".format(
            self.test_group_dir, self.test_group)
        ps_summary_fpath = "{0}/{1}_{2}_results.csv".format(
            self.param_set_dir, self.test_group, self.param_set)
        tg_results_file = open(tg_summary_fpath, 'a')
        ps_results_file = open(ps_summary_fpath, 'a')
        tg_writer = csv.writer(tg_results_file)
        ps_writer = csv.writer(ps_results_file)

        # determine proportion of tumour taken up by dominant clone
        if not popn.is_dead():
            dom_clone_size = analytics.get_dom_clone_size(popn.subpop)
            dom_clone_proportion = dom_clone_size / float(popn.tumoursize)
        else:
            dom_clone_proportion = 0

        # determine average depth of clones in tumour
        avg_depth = analytics.get_avg_depth(popn)

        # calculate total number of mutations
        total_mutns = 0
        for mut_type in popn.all_mutations:
            total_mutns += len(popn.all_mutations[mut_type])

        generated_resist_mutns = len(popn.all_mutations['r'])
        surviving_resist_mutns = 0
        for mut in popn.all_mutations['r']:
            if not mut.original_clone.is_dead_end():
                surviving_resist_mutns += 1

        # assemble values to write
        summary_vals = (self.param_set, self.run_number, went_through_crash,
                        recovered, recover_type, recover_percent, popn.opt.pro,
                        popn.opt.die, popn.opt.mut, treatmt.crash_time,
                        treatmt.init_select_pressure,
                        max(popn.analytics_base.select_pressure),
                        popn.opt.prob_mut_pos, popn.opt.prob_mut_neg,
                        popn.opt.prob_inc_mut, popn.opt.prob_dec_mut,
                        popn.analytics_base.tumoursize[-1],
                        popn.analytics_base.clonecount[-1],
                        '{:.5f}'.format(dom_clone_proportion),
                        '{:.5f}'.format(avg_depth),
                        popn.analytics_base.avg_mutation[-1],
                        popn.analytics_base.avg_proliferation[-1],
                        secs_to_hms(elapsed_time), tot_cycles, total_mutns,
                        generated_resist_mutns, surviving_resist_mutns,
                        min_val, min_time, max_val, max_time, cmin_val,
                        cmin_time, cmax_val, cmax_time)

        tg_writer.writerow(summary_vals)
        ps_writer.writerow(summary_vals)
        tg_results_file.close()
        ps_results_file.close()
Example #7
def get_intervals():
    """Get intervals data"""

    f_start = time.time()

    # read in data
    intervals_df = pd.DataFrame()
    for sheet in ['INTERVALS_1', 'INTERVALS_2']:
        sheet_df = pd.read_excel(xlsx, sheet_name=sheet)
        intervals_df = pd.concat([intervals_df, sheet_df], ignore_index=True)
    intervals_df['DK_Diff'] = intervals_df['TOP [ft]'] - intervals_df['BASE [ft]']
    intervals_df.rename(columns={'STATUS': 'DK_STATUS'}, inplace=True)
    intervals_df = pd.get_dummies(intervals_df, columns=['DK_STATUS'])

    # get aggregate level data
    # start off with the number of intervals per well
    aggregates = intervals_df.groupby('UNIQUEID')['DK'].max().to_frame()
    aggregates.rename(columns={'DK': 'Total_Num_DK'}, inplace=True)
    for metric in ['TOP [ft]', 'BASE [ft]', 'DK_Diff']:

        # calc average depth across all DK
        new_col = 'Avg_DK_{}'.format(metric)
        aggregates[new_col] = intervals_df.groupby('UNIQUEID')[metric].mean()

        # calc min depth across all DK
        new_col = 'Min_DK_{}'.format(metric)
        aggregates[new_col] = intervals_df.groupby('UNIQUEID')[metric].min()

        # calc max depth across all DK
        new_col = 'Max_DK_{}'.format(metric)
        aggregates[new_col] = intervals_df.groupby('UNIQUEID')[metric].max()

    aggregates.reset_index(inplace=True)

    # get time series data
    status_cols = [c for c in intervals_df if c.startswith('DK_STATUS_')]
    int_ts = intervals_df.groupby(['UNIQUEID', 'Date'])[status_cols].sum()
    open_dk = intervals_df.loc[intervals_df['DK_STATUS_OPEN'] == 1]
    for metric in ['TOP [ft]', 'BASE [ft]', 'DK_Diff']:

        # calc average depth across all open DK
        new_col = 'Avg_Open_DK_{}'.format(metric)
        int_ts[new_col] = open_dk.groupby(['UNIQUEID', 'Date'])[metric].mean()

        # calc min depth across all open DK
        new_col = 'Min_Open_DK_{}'.format(metric)
        int_ts[new_col] = open_dk.groupby(['UNIQUEID', 'Date'])[metric].min()

        # calc max depth across all open DK
        new_col = 'Max_Open_DK_{}'.format(metric)
        int_ts[new_col] = open_dk.groupby(['UNIQUEID', 'Date'])[metric].max()

    # resample time series data from daily to monthly
    int_ts.reset_index(inplace=True)
    int_ts['Date'] = int_ts['Date'].dt.to_period('M').dt.to_timestamp('M')
    int_ts = int_ts.groupby(['UNIQUEID', 'Date']).last()
    int_ts.fillna(0, inplace=True)
    int_ts.reset_index(inplace=True)

    # combine aggregate with time series data
    all_interval = int_ts.merge(aggregates, how='left', on='UNIQUEID')

    utils.print_log('intervals prepped: ' +
                    utils.secs_to_hms(time.time() - f_start))

    return all_interval
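
For context, each of the functions above returns a frame keyed on UNIQUEID (and, for the monthly tables, a month-end Date), so a downstream step could join them into a single modelling table. A rough sketch of that assembly, assuming simple left joins onto the production history (this step is not part of the examples shown here):

# Hypothetical assembly step -- not shown in the examples above.
prod = get_production_data()    # monthly rows keyed on UNIQUEID + Date
events = get_events_data()      # monthly 0/1 event flags
gen = get_general_data()        # one row per well

df = prod.merge(events, how='left', on=['UNIQUEID', 'Date'])
df = df.merge(gen, how='left', on='UNIQUEID')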