def cross_validate(train_files, test_files, model_name, exp, kfolds):
    # Complete args
    model_name = model_name.lower() if model_name else input('model_name=?').lower()
    exp_num = 'exp_'
    n = str(exp) if exp else input('exp_?')
    exp_num += n
    logger.info('---------- Cross validation {} on {} start ----------'.format(
        model_name, exp_num))

    # Load data
    logger.info('Loading training set: {}'.format(list(train_files)))
    train_df = load_corpus(list(train_files))
    if test_files != ():
        logger.info('Loading test set: {}'.format(list(test_files)))
        test_df = load_corpus(list(test_files))
    else:
        test_df = None

    # Load configs
    cfg_path = get_envar('CONFIG_PATH') + '/' + get_envar('BASE_CONFIG')
    logger.info('Loading base_configs from {}'.format(cfg_path))
    base_configs = read_config(cfg_path, obj_view=True)
    logger.info('Loading exp_configs on {} from {}'.format(
        exp_num, base_configs.exp_configs.path))
    exp_configs = read_config(base_configs.exp_configs.path, obj_view=False)[exp_num]
    description = exp_configs['description']
    hyparams = exp_configs['hyperparams']
    logger.info('Experiment description: {}'.format(description.strip()))
    logger.info('Hyperparams: {}'.format(hyparams))
    wdir = base_configs.model.savepath + get_timestamp() + '/'

    # CV
    kf = KFold(n_splits=kfolds, shuffle=True, random_state=42)
    cv = {}
    for k, (train_idx, val_idx) in enumerate(kf.split(train_df)):
        logger.info(f'-- Cross validation split {k+1} --')
        rec = train_validate(model_name, hyparams, train_df.iloc[train_idx],
                             train_df.iloc[val_idx], test_df)
        cv.update({f'CV_{k+1}': rec})

    (_, cv_val, cv_test), df = best_scores(cv, complete=False)
    logger.info(f'**CV RESULTS** val_acc3={cv_val:.2%} test_acc3={cv_test:.2%}')
    df.to_clipboard()
    logger.info(f'CV details copied to clipboard \n{df}')
    logger.info('---------- Cross validation {} on {} end ----------'.format(
        model_name, exp_num))
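# A minimal sketch of how `cross_validate` might be exposed on the command
# line. The empty-tuple checks above suggest Click-style `multiple=True`
# options, but that is an assumption; the command name, flags, and defaults
# below are illustrative, not taken from the source.
import click


@click.command()
@click.option('--train-files', '-t', multiple=True, help='Training corpus file(s).')
@click.option('--test-files', '-T', multiple=True, help='Optional test corpus file(s).')
@click.option('--model-name', '-m', default=None, help='Key into VALID_MODELS.')
@click.option('--exp', '-e', default=None, help='Experiment number, e.g. 1 for exp_1.')
@click.option('--kfolds', '-k', default=5, show_default=True, help='Number of CV folds.')
def cross_validate_cmd(train_files, test_files, model_name, exp, kfolds):
    cross_validate(train_files, test_files, model_name, exp, kfolds)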
def test_read_conf(self):
    config = read_config(self.config_path)
    sections = [x for x in config.keys()]
    for x in ['general', 'scraping']:
        with self.subTest(x=x):
            self.assertTrue(x in sections)

    # test general section
    subsections = [x for x in config['general'].keys()]
    for x in ['db_uri']:
        with self.subTest(x=x):
            self.assertTrue(x in subsections)

    # test scraping section
    subsections = [x for x in config['scraping'].keys()]
    for x in ['years_range', 'folder_images', 'folder_thumbnails',
              'converter', 'n_proc']:
        with self.subTest(x=x):
            self.assertTrue(x in subsections)
    self.assertTrue(isinstance(config['scraping']['years_range'], list))
    self.assertTrue(isinstance(config['scraping']['n_proc'], int))
    self.assertTrue(len(config['scraping']['years_range']) == 2)
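# For reference, a config that satisfies `test_read_conf` would parse into a
# mapping shaped roughly like the dict below. Only the section names, key
# names, and the type/length checks come from the test; the concrete values
# are placeholders.
EXAMPLE_CONFIG = {
    'general': {
        'db_uri': 'sqlite:///example.db',
    },
    'scraping': {
        'years_range': [1990, 2020],   # must be a list of length 2
        'folder_images': './images',
        'folder_thumbnails': './thumbnails',
        'converter': 'imagemagick',
        'n_proc': 4,                   # must be an int
    },
}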
def __init__(self, lexicon_manager=None, x_col='SENT', y_col='CLS', asp_col='ASP'):
    self.x_col = x_col
    self.y_col = y_col
    self.asp_col = asp_col
    self.lm = lexicon_manager
    self.configs = read_config(
        get_envar('CONFIG_PATH') + '/' + get_envar('BASE_CONFIG'), obj_view=True)
    self.__initialize()
def __init__(self, lx_path=None, usecol=-1, lx_size=-1, append_neg=False):
    """
    Parameters
    ----------
    lx_path : str
        Path to the lexicon table, without the '.csv' extension.
    usecol : int or list
        Which lexicons to use (an int, or a list of column names).
    """
    self.usecol = usecol
    self.lx_size = lx_size
    self.append_neg = append_neg
    if lx_path is None:
        configs = read_config(
            get_envar('CONFIG_PATH') + '/' + get_envar('BASE_CONFIG'), obj_view=True)
        self.lx_path = configs.lexicon_table.path + '.csv'
        self.usecol = configs.lexicon_table.usecol
        self.lx_size = configs.lexicon_table.lx_size
        self.append_neg = configs.lexicon_table.append_neg
    else:
        self.lx_path = lx_path + '.csv'
    self.__initialize()
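# Usage sketch, assuming this __init__ belongs to the LexiconManager used by
# the training code; the path and column names below are hypothetical.
lm_default = LexiconManager()  # everything read from configs.lexicon_table
lm_custom = LexiconManager(lx_path='data/lexicons/my_lexicon',   # hypothetical path, no '.csv'
                           usecol=['positive', 'negative'])      # hypothetical column names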
def __init__(self, path_config='./config/production.conf'):
    # Prefer the path from the 'configapi' env var, falling back to the argument
    # (the original overwrote the argument unconditionally, so the default was never used).
    path_config = os.getenv('configapi', path_config)
    self.config = read_config(path_config)
    self.db = get_db(self.config['general']['db_uri'])
def train(train_files, val_files, test_files, model_name, exp):
    # Complete args
    model_name = model_name.lower() if model_name else input('model_name=?').lower()
    exp_num = 'exp_'
    n = str(exp) if exp else input('exp_?')
    exp_num += n
    logger.info('---------- Training {} on {} start ----------'.format(
        model_name, exp_num))

    # Load data
    logger.info('Loading training set: {}'.format(list(train_files)))
    train_df = load_corpus(list(train_files))
    if val_files != ():
        logger.info('Loading validation set: {}'.format(list(val_files)))
        val_df = load_corpus(list(val_files))
    else:
        val_df = None
    if test_files != ():
        logger.info('Loading test set: {}'.format(list(test_files)))
        test_df = load_corpus(list(test_files))
    else:
        test_df = None

    # Load configs
    cfg_path = get_envar('CONFIG_PATH') + '/' + get_envar('BASE_CONFIG')
    logger.info('Loading base_configs from {}'.format(cfg_path))
    base_configs = read_config(cfg_path, obj_view=True)
    logger.info('Loading exp_configs on {} from {}'.format(
        exp_num, base_configs.exp_configs.path))
    exp_configs = read_config(base_configs.exp_configs.path, obj_view=False)[exp_num]
    description = exp_configs['description']
    hyparams = exp_configs['hyperparams']
    logger.info('Experiment description: {}'.format(description.strip()))
    logger.info('Hyperparams: {}'.format(hyparams))
    wdir = base_configs.model.savepath + get_timestamp() + '/'

    # Build model
    lm = LexiconManager()
    dm = AbsaDataManager(lexicon_manager=lm)
    model = VALID_MODELS[model_name.lower()]
    model = model(datamanager=dm, parameters=hyparams)

    # Train
    model.train(train_df, val_df, test_df)

    # Predict and score on test
    if test_df is not None:
        _, _, loss_, acc3_ = model.score(test_df)
        logger.info('Final score on test set: '
                    'test_loss={loss:.4f} '
                    'test_acc3={acc:.2%}'.format(loss=loss_, acc=acc3_))

    # Save model
    model.save(wdir)

    # Close the tf.Session (not strictly necessary, but keeps resources tidy)
    model.close_session()
    logger.info('---------- Training {} on {} end ----------'.format(
        model_name, exp_num))
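# The training loop above only relies on a small model interface. A class
# registered in VALID_MODELS needs roughly the shape below; this is a sketch
# inferred from the calls made in `train`, not an actual class from the source.
class ModelInterface:
    def __init__(self, datamanager, parameters): ...

    def train(self, train_df, val_df, test_df): ...

    def score(self, df):
        """Return a 4-tuple whose last two items are loss and 3-class accuracy."""

    def save(self, wdir): ...

    def close_session(self): ...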
import os

import neptune
from neptunecontrib.versioning.data import log_data_version
from neptunecontrib.api.utils import get_filepaths
from category_encoders import OrdinalEncoder
from pandarallel import pandarallel
import pandas as pd
import numpy as np

from src.features.const import ID_COLS, V1_COLS, V1_CAT_COLS
from src.utils import read_config, check_env_vars
from src.features.utils import load_and_merge

pandarallel.initialize()
check_env_vars()
CONFIG = read_config(config_path=os.getenv('CONFIG_PATH'))
neptune.init(project_qualified_name=CONFIG.project)

RAW_DATA_PATH = CONFIG.data.raw_data_path
FEATURES_DATA_PATH = CONFIG.data.features_data_path
FEATURE_NAME = 'v1'
NROWS = None


def _split_email(x, colname):
    # Split an email domain like 'gmail.com' into its first token and the rest.
    if type(x) == float and np.isnan(x):
        email_first, email_rest = None, None
    else:
        split = x.split('.')
        email_first = split[0]
        # NOTE: the original snippet was truncated here; joining the remaining
        # tokens back together is an assumed, minimal completion.
        email_rest = '.'.join(split[1:]) if len(split) > 1 else None
    return email_first, email_rest
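# Sketch of how `_split_email` could be applied to an email-domain column with
# pandarallel. The column name 'P_emaildomain', the helper name, and the output
# column names are assumptions for illustration only.
def _add_email_features(df, colname='P_emaildomain'):
    parts = df[colname].parallel_apply(lambda x: _split_email(x, colname))
    df[f'{colname}_first'] = parts.map(lambda t: t[0])
    df[f'{colname}_rest'] = parts.map(lambda t: t[1])
    return df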
#!/usr/bin/env python3
import sys

from src.ircclient import IRCClient
from src.discordclient import DiscordClient
from src import utils

config_file = sys.argv[1] if len(sys.argv) == 2 else None
settings = utils.read_config(config_file)
settings['irc']['master_bot'] = True

# Cross-wire the two clients so each can forward messages to the other.
discord_client = DiscordClient(settings)
irc_client = IRCClient(settings)
discord_client.set_irc(irc_client)
irc_client.set_discord(discord_client)

irc_client.h_run()
discord_client.h_run()