Example #1
def get_lps_share_integral_for_pools(_df, _realtime=None, _exclusions={}):
    LOGGER.debug('get_lps_share_integral_for_pools')
    pools_list = _df.index.get_level_values('pool_address').drop_duplicates()
    lm_shares_df = pd.DataFrame()
    for pool in pools_list:
        LOGGER.info('Computing shares of incentives for pool ' + pool)
        pool_data = _df.loc[pool]
        uneligible_lps = [
            address.lower() for address in _exclusions.get(pool, [])
        ]
        eligible_lps_pool_data = pool_data.query(
            f'lp_address not in {uneligible_lps}')
        if len(uneligible_lps) > 0:
            LOGGER.info(
                f'Total LPs: {len(pool_data.index.get_level_values("lp_address").drop_duplicates())}'
            )
            LOGGER.info(f'Ineligible LPs: {uneligible_lps}')
            LOGGER.info(
                f'Eligible LPs: {len(eligible_lps_pool_data.index.get_level_values("lp_address").drop_duplicates())}'
            )
        lps_shares = get_lps_share_integral_for_pool(
            eligible_lps_pool_data,
            _realtime=_realtime,
            _exclusions=uneligible_lps)
        pool_df = pd.DataFrame(lps_shares)
        pool_df['pool_address'] = pool
        lm_shares_df = pd.concat([lm_shares_df, pool_df])  # DataFrame.append was removed in pandas 2.x
    lm_shares_df = lm_shares_df.reset_index().set_index(
        ['pool_address', 'lp_address'])
    return lm_shares_df
def train(model, epochs, optimizer, grad, train_dataset):
    """
     A main training function for training the models.
    """
    # Keep results for plotting
    train_loss_results = []
    train_accuracy_results = []

    for epoch in tqdm(range(epochs)):
        epoch_loss_avg = tf.keras.metrics.Mean()
        epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

        # Training loop - using batches of 32
        for x, y, timeStep in train_dataset:
            # Optimize the model
            loss_value, grads = grad(model, x, y)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # Track progress
            epoch_loss_avg(loss_value)  # Add current batch loss
            # Compare predicted label to actual label
            # training=True is needed only if there are layers with different
            # behavior during training versus inference (e.g. Dropout).
            epoch_accuracy(y, model(x, training=True))

        # End epoch
        train_loss_results.append(epoch_loss_avg.result())
        train_accuracy_results.append(epoch_accuracy.result())

        if epoch % 50 == 0:
            LOGGER.info("Epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(
                epoch, epoch_loss_avg.result(), epoch_accuracy.result()))

    # Return the collected metrics so callers can plot them.
    return train_loss_results, train_accuracy_results
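# A minimal sketch of the `grad` helper this loop expects: it should return the
# batch loss and the gradients w.r.t. the model's trainable variables. The
# sparse categorical cross-entropy loss is an assumption based on the
# SparseCategoricalAccuracy metric above, not taken from the original project.
import tensorflow as tf

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def grad(model, x, y):
    with tf.GradientTape() as tape:
        logits = model(x, training=True)
        loss_value = loss_object(y, logits)
    return loss_value, tape.gradient(loss_value, model.trainable_variables)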
Example #3
def get_lps_share_integral_for_pool(_df, _realtime=None, _exclusions={}):
    LOGGER.debug('get_lps_share_integral_for_pool')
    df = compute_LM_power_timeseries(_df)

    latest_timestamp = df.index.get_level_values('block_timestamp').max()
    intervals = compute_timestamp_intervals(
        df.index.get_level_values('block_timestamp'), _realtime)
    df = df.join(intervals)

    total_lm_power = compute_total_LM_power_timeseries(df)
    df = df.join(total_lm_power)

    df['lm_share'] = df['lm_power'] / df['total_lm_power']
    df['share_integral'] = df['lm_share'] * df['state_duration']

    latest_share = df.iloc[df.index.get_locs([slice(None),
                                              latest_timestamp])]['lm_share']
    latest_share = latest_share.droplevel('block_timestamp')

    total_share_integral = df['share_integral'].sum()
    lp_lm_share = df.groupby(
        'lp_address')['share_integral'].sum() / total_share_integral
    result = latest_share.to_frame().join(lp_lm_share)
    result.columns = ['latest_share', 'share_integral']
    return result
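# Worked toy example of the time-weighted share computed above (all numbers are
# made up): LP A holds 50% of the pool's LM power for 100 s, then 25% for 50 s.
durations = [100, 50]              # state_duration per timestamp, in seconds
lp_a_share = [0.50, 0.25]          # LP A's lm_share at each timestamp
share_integral = sum(s * d for s, d in zip(lp_a_share, durations))
total_share_integral = sum(durations)  # lm_share sums to 1 across LPs per interval
print(share_integral / total_share_integral)  # ~0.4167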
Example #4
def get_lm_allocations(_chain_id, _week_number=0, _realtime=None):
    LOGGER.debug('get_lm_allocations')
    week_passed = 3 / 7
    if _realtime:
        _week_number = get_current_lm_week_number()
        week_passed = get_percent_week_passed()

    jsonurl = urlopen(V2_LM_ALLOCATION_URL)
    try:
        week_allocation = json.loads(jsonurl.read())[f'week_{_week_number}']
    except KeyError:
        week_allocation = {}
    for chain_allocation in week_allocation:
        if chain_allocation['chainId'] == _chain_id:
            df = pd.DataFrame()
            for pool, rewards in chain_allocation['pools'].items():
                for r in rewards:
                    if r['tokenAddress'] in EXCLUDED_POOLS_TOKENS.get(
                            pool, []):
                        continue
                    pool_address = pool[:42].lower()
                    df.loc[pool_address,
                           r['tokenAddress']] = r['amount'] * week_passed
            if len(df) == 0:
                LOGGER.info('No incentives for this chain')
                continue
            df.fillna(0, inplace=True)
            df.index.name = 'pool_address'
            return df, week_passed
def clean_financial_data(df):
    """
     Cleans the data for further processing.
      * param: df - Dataframe.
      * returns: Cleaned dataframe.
    """
    LOGGER.info("Cleaning the financial data.")
    df_slice = df[['Identifier (RIC)',
                   'Company Name',
                   'Date of Insolvency',
                   'Score',
                   'Discrimination']]
    df = df.drop(['Identifier (RIC)', 'Company Name',
                  'Date of Insolvency', 'Score', 'Discrimination'], axis=1)
    columns = df.columns

    def convert_string_to_float(x):
        for col in columns:
            # Strip thousands separators, ' -   ' placeholders and percent
            # signs, scale down, and write the converted value back to the row.
            x[col] = float(str(x[col]).replace(",", "").replace(
                ' -   ', str(0)).replace("%", "")) / 1000000
        return x

    df.replace(to_replace='#DIV/0!', value=0, inplace=True)
    df = df.apply(convert_string_to_float, axis=1)
    return pd.concat([df, df_slice], sort=False)
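# Tiny illustration of the string-to-float conversion above (values are made
# up): thousands separators, ' -   ' placeholders and percent signs are
# stripped before the figure is scaled down by 1e6.
for raw in ('1,250,000', ' -   ', '75%'):
    print(float(str(raw).replace(",", "").replace(
        ' -   ', str(0)).replace("%", "")) / 1000000)
# 1.25, 0.0, 7.5e-05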
    def __next__(self):
        try:
            name, dataframe = next(self.groupby_iter)
            marketData = read_market_data(name)
            financial_data = clean_financial_data(dataframe)
            return merge_f_m_data(financial_data, marketData)
        except Exception as ex:
            LOGGER.error(f"Error occurred. {ex}. Skipping.")
Example #7
def fit(name: str, model: Estimator, params: t.Mapping[str, float],
        filename: Path, X: pd.DataFrame, y: pd.Series, /) -> None:
    LOGGER.info(f'Run {name}')
    m = (make_pipeline(model) if params is None else GridSearchCV(
        estimator=make_pipeline(model),
        param_grid=params,
        scoring='neg_root_mean_squared_error',
        n_jobs=3,
        cv=5)).fit(X, y)
    joblib.dump(value=m if params is None else m.best_estimator_,
                filename=filename)
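# Hypothetical usage of fit(): the estimator, parameter grid and output path
# below are illustrative assumptions, and the call relies on the same imports
# the function above already uses (make_pipeline, GridSearchCV, joblib, LOGGER).
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import Ridge

X_demo = pd.DataFrame(np.random.rand(50, 3), columns=['f1', 'f2', 'f3'])
y_demo = pd.Series(np.random.rand(50), name='loss')

# Pass None as params to skip the grid search and fit the bare pipeline instead.
fit('ridge', Ridge(), {'ridge__alpha': [0.1, 1.0, 10.0]},
    Path('ridge.joblib'), X_demo, y_demo)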
Example #8
def apply_redirects(_data, _realtime=None, _redirects=None):
    if _redirects:
        redirects = _redirects
    else:
        redirects = get_redirects(_realtime)
    result = pd.DataFrame(_data).reset_index()
    result['lp_address'] = result['lp_address'].apply(Web3.toChecksumAddress)
    n = len(result['lp_address'][result['lp_address'].isin(redirects.keys())])
    LOGGER.debug(f'{n} redirectors found amongst the recipients')
    result['lp_address'] = result['lp_address'].apply(
        lambda x: redirects.get(x, x))
    result = result.groupby('lp_address').sum()
    return result, n
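# Toy illustration of the redirect step above (addresses and amounts are made
# up): recipients listed in the redirects mapping have their rewards
# re-assigned to the redirect target, then amounts are summed per final address.
import pandas as pd

redirects = {'0xAAA': '0xCCC', '0xBBB': '0xCCC'}   # hypothetical mapping
data = pd.DataFrame({'lp_address': ['0xAAA', '0xBBB', '0xDDD'],
                     'earned': [1.0, 2.0, 3.0]})
data['lp_address'] = data['lp_address'].apply(lambda x: redirects.get(x, x))
print(data.groupby('lp_address').sum())   # 0xCCC -> 3.0, 0xDDD -> 3.0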
Example #9
def compute_timestamp_intervals(_blocks, _realtime=None):
    LOGGER.debug('compute_timestamp_intervals')
    blocks = pd.Series(_blocks).drop_duplicates().sort_values().values
    intervals = pd.Series(blocks, index=blocks).diff().shift(-1)
    if _realtime:
        intervals.iloc[-1] = datetime.datetime.utcnow() - intervals.index[-1]
    else:
        intervals.iloc[-1] = intervals.index[0] + \
            datetime.timedelta(days=7) - intervals.index[-1]
    intervals = intervals.dt.total_seconds()
    intervals.name = 'state_duration'
    intervals.index.name = 'block_timestamp'
    return intervals
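# Self-contained illustration of the interval logic above (timestamps are made
# up): each timestamp's duration is the gap to the next one, and the last
# interval is closed out against the end of the 7-day week.
import pandas as pd

ts = pd.to_datetime(['2021-06-01 00:00', '2021-06-03 12:00', '2021-06-05 00:00'])
s = pd.Series(ts, index=ts)
durations = s.diff().shift(-1)                              # time until the next state change
durations.iloc[-1] = ts[0] + pd.Timedelta(days=7) - ts[-1]  # remainder of the week
print(durations.dt.total_seconds())   # 216000.0, 129600.0, 259200.0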
def read_market_data(name):
    """
     Gets the market data from the data/market_data directory.
     param: name - Name of the company for which we want the data.
     return: Dataframe
    """
    parentDirectory = os.path.abspath(os.getcwd())
    try:
        LOGGER.info(f"Getting market data for {name}.")
        df = pd.read_csv(
            os.path.join(parentDirectory, 'data/market_data', f'{name}.csv'))
        return df
    except Exception as ex:
        LOGGER.error(f"An error occurred when fetching data for {name}, {ex}")
Example #11
def enrich_data(data):
    """
    Enhances data frame with information on indicators and price patterns. Indicators and patterns are align so
    that they represent details for previous minute.

    :param data: DataFrame
    :return: DataFrame
    """
    LOGGER.info("Adding TA-Lib indicators as features...")
    # We specifically do shifting here so that all additional data represents information about past history.
    df = pd.concat(
        (data, get_indicators(data).shift(), get_price_patterns(data).shift()),
        axis=1)
    df = df.fillna(method='ffill')
    return df.dropna()
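# Small demonstration of the shift() alignment described in the docstring
# (prices are made up): after shifting, the feature in row t is computed from
# minute t-1, so no same-minute information leaks into the features.
import pandas as pd

close = pd.Series([100.0, 101.0, 99.0, 102.0], name='close')
indicator = close.rolling(2).mean()        # stands in for a TA-Lib indicator
print(pd.concat([close, indicator.shift().rename('prev_ma2')], axis=1))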
def train(model, epoch, optimizer, grad, generator):
    """
     A main training function for training the models.
      Arguments:
      * model- Model to train.
      * epochs- number of epochs to train the model
      * optimizer- optimizer to be used.
      * grad- gradient function which computes gradient for the model layers.
      * generator- training dataset on which model is trained.
    """
    # Keep results for plotting
    train_loss_results = []
    train_accuracy_results = []

    # for epoch in range(epochs):
    epoch_loss_avg = tf.keras.metrics.Mean()
    epoch_accuracy = tf.keras.metrics.BinaryAccuracy()

    # Training loop - using batches of 1
    for i in range(len(generator)):
        x, y, timeStep = generator.get_next_step()
        x = x.reshape(1, 1, x.shape[0])
        y = y.reshape(1, 1)
        timeStep = timeStep.reshape(1, 1)

        # Optimize the model
        loss_value, grads = grad(model, x, y, i, timeStep)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Track progress
        epoch_loss_avg(loss_value)  # Add current batch loss
        # Compare predicted label to actual label
        # training=True is needed only if there are layers with different
        # behavior during training versus inference (e.g. Dropout).
        epoch_accuracy(y, model(x, training=True))

    # End epoch
    train_loss_results.append(epoch_loss_avg.result())
    train_accuracy_results.append(epoch_accuracy.result())

    LOGGER.info("Epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(
        epoch, epoch_loss_avg.result(), epoch_accuracy.result()))

    return train_loss_results, train_accuracy_results
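# A minimal sketch of the `grad` helper this variant expects: same idea as the
# epoch-based loop earlier, but the step index and time step are passed through
# (and simply ignored here). Binary cross-entropy is an assumption based on the
# BinaryAccuracy metric above, not taken from the original project.
import tensorflow as tf

bce = tf.keras.losses.BinaryCrossentropy(from_logits=True)

def grad(model, x, y, step, time_step):
    with tf.GradientTape() as tape:
        logits = model(x, training=True)
        loss_value = bce(y, logits)
    return loss_value, tape.gradient(loss_value, model.trainable_variables)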
Example #13
    def run(self, validate=False):
        best_f1 = 0
        for epoch in range(self.epochs):
            start_time = time.time()

            avg_loss = self.run_train_epoch()

            if validate:
                valid_preds, avg_val_loss, val_f1 = self.run_validation_epoch()
                elapsed_time = time.time() - start_time
                LOGGER.info('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t val_f1={:.4f}  \t time={:.2f}s'.format(
                        epoch + 1, self.epochs, avg_loss, avg_val_loss, val_f1, elapsed_time))
                if val_f1 > best_f1:
                    self.save_model()
                    best_f1 = val_f1
            else:
                elapsed_time = time.time() - start_time
                LOGGER.info('Epoch {}/{} \t loss={:.4f} \t time={:.2f}s'.format(
                    epoch + 1, self.epochs, avg_loss, elapsed_time))
                self.save_model()
            self.scheduler.step()

        if not validate or val_f1 != best_f1:
            self.load_model()
            valid_preds, avg_val_loss, val_f1 = self.run_validation_epoch()
            elapsed_time = time.time() - start_time
            LOGGER.info('Fin {}epochs \t loss={:.4f} \t val_loss={:.4f} \t val_f1={:.4f}  \t time={:.2f}s'.format(
                self.epochs, avg_loss, avg_val_loss, val_f1, elapsed_time))

        test_preds = self.run_test_epoch()

        return valid_preds, test_preds
Example #14
def query_gbq(_network, _week_number, _pool_list, _excluded_lps_list=[]):
    LOGGER.debug('query_gbq')

    _excluded_lps_list = list(set(_excluded_lps_list + BASE_LP_EXCLUSION_LIST))

    with open(SQL_FILE_PATH, 'r') as file:
        sql = file.read()

    _days_in_week = '3'

    sql = sql.format(
        week_number=_week_number,
        pool_addresses="','".join(_pool_list),
        blocks_table=TABLES_CONFIGS[_network]['blocks'],
        lm_transfers_table=TABLES_CONFIGS[_network]['lm_transfers'],
        lm_state_table=TABLES_CONFIGS[_network]['lm_state'],
        excluded_lps="','".join(_excluded_lps_list),
        days_in_week=_days_in_week)
    client = bigquery.Client()
    bqstorageclient = bigquery_storage.BigQueryReadClient()
    df = (client.query(sql).result().to_dataframe(
        bqstorage_client=bqstorageclient))
    df = df.groupby(['pool_address', 'lp_address', 'block_timestamp']).sum()
    return df
def update_gbq_api_dataset(_full_export, _week_number):
    # zero previous week's velocity
    prev_week = _week_number - 1
    LOGGER.info(f'Zeroing velocity for week {prev_week}')
    sql = f'''
        UPDATE {PROJECT_ID}.bal_mining_estimates.lp_estimates_multitoken
        SET velocity = '0'
        WHERE week = {prev_week}
    '''
    client = bigquery.Client()
    query = client.query(sql)
    query.result()
    LOGGER.info('Done')

    _full_export.reset_index(inplace=True)
    LOGGER.info('Saving estimates to staging...')
    _full_export.to_gbq('bal_mining_estimates.lp_estimates_multitoken_staging',
                        project_id=PROJECT_ID,
                        if_exists='replace')
    LOGGER.info('Done')

    # merge staging into prod
    sql = '''
    MERGE bal_mining_estimates.lp_estimates_multitoken prod
    USING bal_mining_estimates.lp_estimates_multitoken_staging stage
    ON prod.address = stage.address
    AND prod.week = stage.week
    AND prod.chain_id = stage.chain_id
    AND prod.token_address = stage.token_address
    WHEN MATCHED THEN
        UPDATE SET 
            earned = stage.earned,
            velocity = stage.velocity,
            timestamp = stage.timestamp
    WHEN NOT MATCHED BY TARGET THEN
        INSERT (address, week, chain_id, token_address, earned, velocity, timestamp)
        VALUES (address, week, chain_id, token_address, earned, velocity, timestamp)
    '''
    client = bigquery.Client()
    query = client.query(sql)
    query.result()
def save_report(_week_number, _chain_id, _token, _data):
    LOGGER.debug(f'saving {_token} report...')
    network = NETWORKS[_chain_id]
    reports_dir = f'reports/{_week_number}'
    if not os.path.exists(reports_dir):
        os.mkdir(reports_dir)
    filename = f'{reports_dir}/__{network}_{_token}.json'
    export_data = _data[_data > get_claim_threshold(_token)]
    export = export_data.apply(
        lambda x: format(x, f'.{get_claim_precision(_token)}f'))
    export_json = export.to_json()
    parsed_export = json.loads(export_json)
    with open(filename, "w") as write_file:
        json.dump(parsed_export, write_file, indent=4)
    LOGGER.debug(f'saved to {filename}')
    if _chain_id == 1 and _token == '0xba100000625a3754423978a60c9317c58a424e3d':
        filename = f'{reports_dir}/_totals.json'
        with open(filename, "w") as write_file:
            json.dump(parsed_export, write_file, indent=4)
        LOGGER.debug(f'saved to {filename}')
Example #17
def compute_LM_power_timeseries(_df):
    LOGGER.debug('compute_LM_power_timeseries')
    df = _df.copy()
    df['lm_power'] = df.sort_index().groupby(['lp_address']).cumsum()
    df = df.drop(columns=['delta'])
    df = df.clip(lower=0)

    lp_addresses_list = (df.index.get_level_values(
        'lp_address').drop_duplicates().sort_values().values)
    block_timestamps_list = (df.index.get_level_values(
        'block_timestamp').drop_duplicates().sort_values().values)
    levels = [lp_addresses_list, block_timestamps_list]
    new_index = pd.MultiIndex.from_product(
        levels,
        names=['lp_address', 'block_timestamp'],
    )
    df = df.tz_localize(None, level='block_timestamp')
    LOGGER.debug('reindexing ({})...'.format(len(block_timestamps_list)))
    df = df.reindex(index=new_index)
    df.loc(axis=0)[:, block_timestamps_list[0]] = df.loc(
        axis=0)[:, block_timestamps_list[0]].fillna(0)
    df = df.fillna(method='pad')
    LOGGER.debug('done')
    return df
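# Minimal illustration of the reindex-and-pad technique above (toy data, not
# from the original project): every LP gets a row at every observed timestamp,
# the first timestamp is zero-filled, and later gaps carry the last known
# state forward.
import pandas as pd

obs = pd.DataFrame(
    {'lm_power': [10.0, 5.0, 12.0]},
    index=pd.MultiIndex.from_tuples(
        [('0xaaa', 1), ('0xbbb', 2), ('0xaaa', 3)],
        names=['lp_address', 'block_timestamp']))
full_index = pd.MultiIndex.from_product(
    [['0xaaa', '0xbbb'], [1, 2, 3]],
    names=['lp_address', 'block_timestamp'])
full = obs.reindex(full_index)
full.loc[(slice(None), 1), :] = full.loc[(slice(None), 1), :].fillna(0)
full = full.ffill()
print(full)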
import json
import pandas as pd
from sklearn.preprocessing import power_transform
from sklearn.pipeline import make_pipeline
from pandas_profiling import ProfileReport
from pathlib import Path

p = Path(__file__).parents[1]

# To load project modules
import sys; sys.path.append(str(p))
from src.logger import LOGGER
from src import estimators as e
from src.ranker import Ranker


A4_DIMS = (11.7, 8.27)

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.drop(labels='loss', axis=1)
y = df['loss'].copy()

LOGGER.info('Process target')
y = pd.Series(data=power_transform(y.values.reshape(-1, 1)).flatten(), name='loss', index=y.index)

LOGGER.info('Load categorical features to drop')
noVarFeatures = json.load(open(file=p.joinpath('src', 'meta', 'NoVariance.json'), mode='r'))

LOGGER.info('Process categorical features')
catf = pd.DataFrame(
    data=make_pipeline(
        e.CategoricalGrouper(),
        e.CategoricalEncoder()
Example #19
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import (power_transform, quantile_transform, scale,
                                   StandardScaler, FunctionTransformer)
from sklearn.pipeline import make_pipeline
from pathlib import Path

p = Path(__file__).parents[1]

# To load project modules
import sys
sys.path.append(str(p))
from src.logger import LOGGER
from src import estimators as e

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.drop(labels='loss', axis=1)
y = df['loss'].copy()

LOGGER.info('Process target')
y = pd.Series(data=power_transform(y.values.reshape(-1, 1)).flatten(),
              name='loss',
              index=y.index)

LOGGER.info('Load categorical features to drop')
noVarFeatures = json.load(
    open(file=p.joinpath('src', 'meta', 'NoVariance.json'), mode='r'))

LOGGER.info('Process categorical features')
catf = pd.DataFrame(data=make_pipeline(
Example #20
        data=pca.components_,
        columns=cols,
        index=[f'Component {c+1}' for c in range(len(cols))]),
                     cmap='RdBu',
                     annot=True,
                     fmt='.2f',
                     cbar=False)
    ax.set(title=title)
    plt.tight_layout()
    fig.savefig(fname=p.joinpath('reports', 'figures', out),
                dpi=800,
                format='png')
    plt.close(fig=fig)


LOGGER.info('Load correlated features')
CORRELATED = json.load(
    open(file=p.joinpath('src', 'meta', 'Correlated.json'), mode='r'))

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.filter(CORRELATED)
y = df['loss'].copy()

LOGGER.info('Process target')
y = pd.Series(data=power_transform(y.values.reshape(-1, 1)).flatten(),
              name='loss',
              index=y.index)

LOGGER.info('Process categorical features')
catf = pd.DataFrame(data=make_pipeline(e.CategoricalGrouper(),
Example #21
    '--rt',
    help='pass 1 to run realtime estimator; ignores week parameter',
    type=int)
parser.add_argument(
    '--logs',
    help='{WARNING, DEBUG, INFO}',
    type=str)
args = parser.parse_args()

# initialize the logger
log_level = 'INFO'  # assumed default when --logs is not passed
if args.logs:  # a log level was passed
    log_level = args.logs.upper()
logger_init(log_level)

if args.rt:  # if realtime = True
    LOGGER.info('Running realtime estimator')
    week_number = this_week
    realtime_estimator = True
    if args.week:
        LOGGER.warning('Running realtime estimator, ignoring week parameter')
elif args.week:
    week_number = args.week
    if week_number > this_week:
        exit(
            f'Error: week {week_number} is in the future, this is week {this_week}')
    if week_number == this_week:
        LOGGER.warning(
            f'Warning: week {week_number} is not over yet. Did you mean to run the realtime estimator?')


# for each network
Example #22
import pandas as pd
from pandas_profiling import ProfileReport
from pathlib import Path

p = Path(__file__).parents[1]

# To load project modules
import sys
sys.path.append(str(p))

from src.logger import LOGGER

LOGGER.info('Load data')
res = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
dev = pd.read_pickle(p.joinpath('data', 'interim', 'development.pkl'))

LOGGER.info('Profile on research')
(ProfileReport(df=res,
               config_file=p.joinpath('src', 'ProfileConf.yml'),
               title='Research dataset',
               dataset={
                   'description': r'Research dataset (10% of total data)'
               }).to_file(p.joinpath('reports', 'profiles', 'research.html')))

LOGGER.info('Profile on development')
(ProfileReport(df=dev,
               config_file=p.joinpath('src', 'ProfileConf.yml'),
               title='Development dataset',
               dataset={
                   'description': r'Development dataset (10% of total data)'
               }).to_file(p.joinpath('reports', 'profiles',
sys.path.append(str(p))
from src.logger import LOGGER
from src import preprocessors as pp
from src.ranker import Ranker

CROSS_VAL_OPTS = {
    'scoring': 'neg_root_mean_squared_error',
    'cv': RepeatedKFold(n_splits=5, n_repeats=20),
    'n_jobs': 3,
    'return_train_score': False,
    'return_estimator': False,
}

r = Ridge(random_state=0, solver='saga', max_iter=1e5)

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.drop(labels='loss', axis=1)
y = df['loss'].copy()

LOGGER.info('Process target')
tp = pp.TargetPreprocessor(
)  # will be needed to convert back to y original units
y = tp.fit_transform(y)

LOGGER.info('Process X')
X = pp.Preprocessor().fit_transform(X, y)

LOGGER.info('Prepare dummy regressor')
dummyRMSE = mean_squared_error(y_true=y,
                               y_pred=DummyRegressor(strategy='mean').fit(
import pandas as pd
import numpy as np
from pathlib import Path

p = Path(__file__).parents[1]

# To load project modules
import sys; sys.path.append(str(p))
from src.logger import LOGGER
from src import preprocessors as pp
from src.ranker import Ranker

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.drop(labels='loss', axis=1)
y = df['loss'].copy()

LOGGER.info('Process target')
y = pp.TargetPreprocessor().fit_transform(y)

LOGGER.info('Process X')
X = pp.Preprocessor().fit_transform(X, y)

LOGGER.info('Create ranking')
rnk = Ranker().rank(X, y)
rnk['Rank'] = rnk['Association Strength'].abs().rank(ascending=False)
rnk['Group'] = np.ceil(rnk['Rank'].div(5))
rnk.to_pickle(p.joinpath('src', 'meta', 'Ranking.pkl'))
Example #25
import json
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import VarianceThreshold
from pathlib import Path

p = Path(__file__).parents[1]

# To load project modules
import sys

sys.path.append(str(p))
from src.logger import LOGGER
from src import estimators as e

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.filter(like='cat')
y = df['loss'].copy()

LOGGER.info('Process data')
X = pd.DataFrame(data=make_pipeline(e.CategoricalGrouper(),
                                    e.CategoricalEncoder()).fit_transform(
                                        X, y),
                 columns=X.columns,
                 index=X.index)

LOGGER.info('Variance threshold analysis')
vt = VarianceThreshold().fit(X)
drop = [col for col in X if col not in X.columns[vt.get_support()]]
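# Toy check of the drop-list construction above (data is made up): a column
# with a single repeated value has zero variance and ends up in the drop list.
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

X_toy = pd.DataFrame({'cat_a': [1, 1, 1, 1], 'cat_b': [0, 1, 0, 1]})
vt_toy = VarianceThreshold().fit(X_toy)
drop_toy = [col for col in X_toy if col not in X_toy.columns[vt_toy.get_support()]]
print(drop_toy)   # ['cat_a']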
# Setup to load modules in `src`
import sys
sys.path.append(str(p))
from src.logger import LOGGER
from src.preprocessors import TargetPreprocessor

CROSS_VAL_OPTS = {
    'scoring': 'neg_root_mean_squared_error',
    'cv': RepeatedKFold(n_splits=5, n_repeats=20),
    'n_jobs': 3,
    'return_train_score': False,
    'return_estimator': False,
}

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'development.pkl'))

LOGGER.info('Load selected features')
FEATURES = json.load(
    open(file=p.joinpath('src', 'meta', 'SelectedFeatures.json'), mode='r'))

LOGGER.info('Get X and y')
X = df.filter(FEATURES)
y = df['loss'].copy()

LOGGER.info('Process target')
tp = TargetPreprocessor()  # will be needed to convert back to y original units
y = tp.fit_transform(y)

LOGGER.info('Load models')
Example #27
    def save_model(self):
        with open(self.model_path, "wb") as fout:
            torch.save(self.model.state_dict(), fout)
        LOGGER.info("Model was saved in {}".format(self.model_path))
Example #28
import pandas as pd
from pathlib import Path

p = Path(__file__).parents[1]

# To load project modules
import sys
sys.path.append(str(p))
from src.logger import LOGGER
from src import preprocessors as pp
from pandas_profiling import ProfileReport

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.drop(labels='loss', axis=1)
y = df['loss'].copy()

LOGGER.info('Process target')
y = pp.TargetPreprocessor().fit_transform(y)

LOGGER.info('Process X')
X = pp.Preprocessor().fit_transform(X, y)

LOGGER.info('Profile final data')
ProfileReport(df=X.join(y),
              config_file=p.joinpath('src', 'ProfileConf.yml'),
              title='Final dataset').to_file(
                  p.joinpath('reports', 'profiles', 'final.html'))
def merge_f_m_data(financial_df, market_df):
    """
     Merge both dataframe to get the final data. The shape of the financal df will be (2*586) while market df will be of (2000*13)
     returns the merged data.
    """
    market_df['Date'] = pd.to_datetime(market_df['Date'])
    columns = financial_df.columns
    transposed_df = financial_df.T
    transposed_df['Row names'] = columns
    transposed_df['Date'] = 0
    date_of_solvency = financial_df['Date of Insolvency']
    # here 1 = Defaulted, 0 = Non-default
    default_score = financial_df['Score']
    LOGGER.info('Merging market data and financial data into one.')

    def get_dates(x):
        # Gives date to the FY-X term.
        for key in DATE_MAP.keys():
            if pd.to_datetime(f'1/1/{DATE_MAP[key]}').dayofweek == 5:
                date = f'1/3/{DATE_MAP[key]}'
            elif pd.to_datetime(f'1/1/{DATE_MAP[key]}').dayofweek == 6:
                date = f'1/2/{DATE_MAP[key]}'
            else:
                date = f'1/1/{DATE_MAP[key]}'

            if x['Row names'].find(key) >= 0:
                x['Date'] = date
        return x

    transposed_df = transposed_df.apply(get_dates, axis=1)
    transposed_df['Date'] = transposed_df['Date'].replace(0, '1/1/2019')
    transposed_df['Date'] = pd.to_datetime(transposed_df['Date'])
    transposed_df = transposed_df.fillna(method='bfill')
    transposed_df = transposed_df.loc[:, ~transposed_df.columns.duplicated()]
    
    required_cols = ['Row names', 'Date']
    transposed_df_cols = transposed_df.columns
    for col in transposed_df_cols:
        if col not in required_cols:
            transposed_df.rename(columns = {col : 'Data'}, inplace=True)
    
    added_cols = []
    final_data = {}
    for index, row in transposed_df.iterrows():
        last_added_col = None if len(added_cols) == 0 else added_cols[-1]
        if len(added_cols) > 0 and row['Row names'].find(last_added_col) >= 0:
            try:
                final_data[last_added_col].append(row['Data'])
                final_data['Date'].append(row['Date'])
            except KeyError:
                final_data[last_added_col] = [row['Data']]
        elif len(added_cols) == 0:
            added_cols.append(row['Row names'])
            final_data[row['Row names']] = [row['Data']]
            final_data['Date'] = [row['Date']]
        elif str(row['Row names']).find(added_cols[-1]) < 0:
            added_cols.append(row['Row names'])
            final_data[row['Row names']] = [row['Data']]

    final_data['Date'] = final_data['Date'][0:15]
    cleaned_f_df = pd.DataFrame(
        dict([(k, pd.Series(v)) for k, v in final_data.items()]))
    merged_df = market_df.merge(cleaned_f_df, on='Date', how='outer')
    merged_df = merged_df.drop(['Identifier (RIC)',
                                'Company Name',
                                'Date of Insolvency',
                                'Score',
                                'Discrimination',
                                'Z Score'],
                               axis=1)
    merged_df = merged_df.sort_values(by='Date', ascending=False)
    merged_df = merged_df.fillna(method='bfill').fillna(method='ffill')
    return (merged_df, date_of_solvency, default_score)
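# Small check of the weekday adjustment inside get_dates above: when 1 January
# falls on a Saturday (dayofweek == 5) the date is pushed to 3 January, on a
# Sunday (== 6) to 2 January. The years below are arbitrary examples.
import pandas as pd

for year in (2022, 2023, 2024):    # Sat, Sun, Mon respectively
    dow = pd.to_datetime(f'1/1/{year}').dayofweek
    if dow == 5:
        date = f'1/3/{year}'
    elif dow == 6:
        date = f'1/2/{year}'
    else:
        date = f'1/1/{year}'
    print(year, dow, date)   # 2022 5 1/3/2022, 2023 6 1/2/2023, 2024 0 1/1/2024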
Example #30

def fit(name: str, model: Estimator, params: t.Mapping[str, float],
        filename: Path, X: pd.DataFrame, y: pd.Series, /) -> None:
    LOGGER.info(f'Run {name}')
    m = (make_pipeline(model) if params is None else GridSearchCV(
        estimator=make_pipeline(model),
        param_grid=params,
        scoring='neg_root_mean_squared_error',
        n_jobs=3,
        cv=5)).fit(X, y)
    joblib.dump(value=m if params is None else m.best_estimator_,
                filename=filename)


LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'development.pkl'))

LOGGER.info('Load selected features')
FEATURES = json.load(
    open(file=p.joinpath('src', 'meta', 'SelectedFeatures.json'), mode='r'))

LOGGER.info('Get X and y')
X = df.filter(FEATURES)
y = df['loss'].copy()

LOGGER.info('Process target')
y = TargetPreprocessor().fit_transform(y)

# Fitting
for name, model, params in MODELS: