Exemple #1
0
def cross_validate(train_files, test_files, model_name, exp, kfolds):
    """Run k-fold cross validation of one model under one experiment config.

    The training corpus is split into ``kfolds`` shuffled folds (fixed
    seed); each split is handed to ``train_validate`` and the per-split
    records are summarised with ``best_scores``. The summary table is logged
    and copied to the clipboard.

    Parameters
    ----------
    train_files : iterable
        Corpus files handed to ``load_corpus`` (as a list); the resulting
        frame is the one that gets split into folds.
    test_files : iterable
        Corpus files for an optional test set. When empty (or ``None``) no
        test set is loaded and ``None`` is passed to ``train_validate``.
    model_name : str
        Model name, lower-cased before use; asked for on stdin when falsy.
    exp : int or str
        Experiment number; the ``exp_<n>`` entry of the experiment configs
        is used. Asked for on stdin when falsy.
    kfolds : int
        Number of splits given to ``KFold``.
    """
    # Complete args
    if not model_name:
        model_name = input('model_name=?')
    model_name = model_name.lower()
    exp_num = 'exp_' + (str(exp) if exp else input('exp_?'))

    logger.info('---------- Cross validation {} on {} start ----------'.format(
        model_name, exp_num))

    # Materialise the file collections once, so a one-shot iterable is not
    # exhausted by the logging call before it reaches load_corpus, and so
    # any empty collection (tuple, list, None) means "no test set".
    train_files = list(train_files)
    test_files = list(test_files or ())

    # Load data
    logger.info('Loading training set: {}'.format(train_files))
    train_df = load_corpus(train_files)
    if test_files:
        logger.info('Loading test set: {}'.format(test_files))
        test_df = load_corpus(test_files)
    else:
        test_df = None

    # Load configs
    cfg_path = get_envar('CONFIG_PATH') + '/' + get_envar('BASE_CONFIG')
    logger.info('Loading base_configs from {}'.format(cfg_path))
    base_configs = read_config(cfg_path, obj_view=True)

    exp_cfg_path = base_configs.exp_configs.path
    logger.info('Loading exp_configs on {} from {}'.format(
        exp_num, exp_cfg_path))
    exp_configs = read_config(exp_cfg_path, obj_view=False)[exp_num]

    description = exp_configs['description']
    hyparams = exp_configs['hyperparams']
    logger.info('Experiment description: {}'.format(description.strip()))
    logger.info('Hyperparams: {}'.format(hyparams))

    # CV: shuffled folds with a fixed seed so the splits are reproducible.
    kf = KFold(n_splits=kfolds, shuffle=True, random_state=42)

    cv = {}
    for k, (train_idx, val_idx) in enumerate(kf.split(train_df), start=1):
        logger.info(f'-- Cross validation split {k} --')
        cv[f'CV_{k}'] = train_validate(model_name, hyparams,
                                       train_df.iloc[train_idx],
                                       train_df.iloc[val_idx], test_df)

    (_, cv_val, cv_test), df = best_scores(cv, complete=False)
    logger.info(
        f'**CV RESULTS** val_acc3={cv_val:.2%} test_acc3={cv_test:.2%}')
    df.to_clipboard()
    logger.info(f'CV details copied to clipboard \n{df}')
    logger.info('---------- Cross validation {} on {} end ----------'.format(
        model_name, exp_num))
Exemple #2
0
    def test_read_conf(self):
        """Check the sections, keys and value types of the parsed config.

        Reads the file at ``self.config_path`` and verifies that the
        'general' and 'scraping' sections exist, that each holds the
        expected keys, and that 'years_range' is a two-element list and
        'n_proc' an int.
        """
        config = read_config(self.config_path)

        # Dedicated assert* methods report the offending values on failure,
        # unlike assertTrue(<expression>) which only says "False is not true".
        sections = list(config.keys())
        for x in ['general', 'scraping']:
            with self.subTest(x=x):
                self.assertIn(x, sections)

        # test general sections
        subsections = list(config['general'].keys())
        for x in ['db_uri']:
            with self.subTest(x=x):
                self.assertIn(x, subsections)

        # test scraping sections
        subsections = list(config['scraping'].keys())
        for x in [
                'years_range', 'folder_images', 'folder_thumbnails',
                'converter', 'n_proc'
        ]:
            with self.subTest(x=x):
                self.assertIn(x, subsections)

        self.assertIsInstance(config['scraping']['years_range'], list)

        self.assertIsInstance(config['scraping']['n_proc'], int)

        self.assertEqual(len(config['scraping']['years_range']), 2)
Exemple #3
0
 def __init__(self, lexicon_manager=None, x_col='SENT', y_col='CLS', asp_col='ASP'):
     """Store the column names and lexicon manager, load the base config.

     Parameters
     ----------
     lexicon_manager : object, optional
         Stored as ``self.lm``.
     x_col, y_col, asp_col : str
         Column names stored as ``self.x_col``, ``self.y_col`` and
         ``self.asp_col``.
     """
     self.x_col, self.y_col, self.asp_col = x_col, y_col, asp_col
     self.lm = lexicon_manager

     # Base config file location is assembled from two environment values.
     base_cfg_file = get_envar('CONFIG_PATH') + '/' + get_envar('BASE_CONFIG')
     self.configs = read_config(base_cfg_file, obj_view=True)

     self.__initialize()
Exemple #4
0
    def __init__(self, lx_path=None, usecol=-1, lx_size=-1, append_neg=False):
        """Choose the lexicon table and its options, then initialise.

        Parameters
        ----------
        lx_path : str, optional
            Path to the lexicon table without the '.csv' extension. When
            None, the path and every other option are taken from the
            ``lexicon_table`` section of the base config, and the
            ``usecol``, ``lx_size`` and ``append_neg`` arguments are ignored.
        usecol : int or list
            Which lexicons to use (a list is a list of column names).
        lx_size : int
            Stored as ``self.lx_size``.
        append_neg : bool
            Stored as ``self.append_neg``.
        """
        if lx_path is not None:
            # Explicit table: options come straight from the arguments.
            self.usecol = usecol
            self.lx_size = lx_size
            self.append_neg = append_neg
            self.lx_path = lx_path + '.csv'
        else:
            # No table given: everything comes from the base config.
            cfg_file = get_envar('CONFIG_PATH') + '/' + get_envar('BASE_CONFIG')
            table_cfg = read_config(cfg_file, obj_view=True).lexicon_table
            self.usecol = table_cfg.usecol
            self.lx_size = table_cfg.lx_size
            self.append_neg = table_cfg.append_neg
            self.lx_path = table_cfg.path + '.csv'
        self.__initialize()
Exemple #5
0
 def __init__(self, path_config='./config/production.conf'):
     """Load the configuration and open the database it points to.

     Parameters
     ----------
     path_config : str
         Path of the config file to read. The ``configapi`` environment
         variable, when set to a non-empty value, takes precedence over
         this argument.
     """
     # Previously the argument was always overwritten by the environment
     # lookup, so an unset variable passed None to read_config. Fall back
     # to the argument (and its default) instead.
     path_config = os.getenv('configapi') or path_config
     self.config = read_config(path_config)
     self.db = get_db(self.config['general']['db_uri'])
Exemple #6
0
def train(train_files, val_files, test_files, model_name, exp):
    """Train one model under one experiment config, score it and save it.

    Parameters
    ----------
    train_files : iterable
        Corpus files handed to ``load_corpus`` (as a list) for training.
    val_files : iterable
        Corpus files for an optional validation set. When empty (or
        ``None``) ``None`` is passed to the model instead.
    test_files : iterable
        Corpus files for an optional test set. When empty (or ``None``)
        ``None`` is passed to the model and no final score is logged.
    model_name : str
        Key into ``VALID_MODELS``, lower-cased before use; asked for on
        stdin when falsy.
    exp : int or str
        Experiment number; the ``exp_<n>`` entry of the experiment configs
        is used. Asked for on stdin when falsy.

    Raises
    ------
    KeyError
        If ``model_name`` is not in ``VALID_MODELS`` or the experiment entry
        is missing from the experiment configs.
    """
    # Complete args
    if not model_name:
        model_name = input('model_name=?')
    model_name = model_name.lower()
    exp_num = 'exp_' + (str(exp) if exp else input('exp_?'))

    logger.info('---------- Training {} on {} start ----------'.format(
        model_name, exp_num))

    # Materialise the file collections once, so a one-shot iterable is not
    # exhausted by the logging call before it reaches load_corpus, and so
    # any empty collection (tuple, list, None) means "set not provided".
    train_files = list(train_files)
    val_files = list(val_files or ())
    test_files = list(test_files or ())

    # Load data
    logger.info('Loading training set: {}'.format(train_files))
    train_df = load_corpus(train_files)
    if val_files:
        logger.info('Loading validation set: {}'.format(val_files))
        val_df = load_corpus(val_files)
    else:
        val_df = None
    if test_files:
        logger.info('Loading test set: {}'.format(test_files))
        test_df = load_corpus(test_files)
    else:
        test_df = None

    # Load configs
    cfg_path = get_envar('CONFIG_PATH') + '/' + get_envar('BASE_CONFIG')
    logger.info('Loading base_configs from {}'.format(cfg_path))
    base_configs = read_config(cfg_path, obj_view=True)

    exp_cfg_path = base_configs.exp_configs.path
    logger.info('Loading exp_configs on {} from {}'.format(
        exp_num, exp_cfg_path))
    exp_configs = read_config(exp_cfg_path, obj_view=False)[exp_num]

    description = exp_configs['description']
    hyparams = exp_configs['hyperparams']
    logger.info('Experiment description: {}'.format(description.strip()))
    logger.info('Hyperparams: {}'.format(hyparams))

    # Each run is saved under its own timestamped directory.
    wdir = base_configs.model.savepath + get_timestamp() + '/'

    # Build Model (model_name is already lower-cased above)
    lm = LexiconManager()
    dm = AbsaDataManager(lexicon_manager=lm)
    model_cls = VALID_MODELS[model_name]
    model = model_cls(datamanager=dm, parameters=hyparams)

    try:
        # Train
        model.train(train_df, val_df, test_df)

        # Predict and score on test
        if test_df is not None:
            _, _, loss_, acc3_ = model.score(test_df)
            logger.info('Final score on test set: '
                        'test_loss={loss:.4f} '
                        'test_acc3={acc:.2%}'.format(loss=loss_, acc=acc3_))

        # Save model
        model.save(wdir)
    finally:
        # Close the session even when training, scoring or saving fails.
        model.close_session()

    logger.info('---------- Training {} on {} end ----------'.format(
        model_name, exp_num))
Exemple #7
0
import neptune
from neptunecontrib.versioning.data import log_data_version
from neptunecontrib.api.utils import get_filepaths
from category_encoders import OrdinalEncoder
from pandarallel import pandarallel
import pandas as pd
import numpy as np

from src.features.const import ID_COLS, V1_COLS, V1_CAT_COLS
from src.utils import read_config, check_env_vars
from src.features.utils import load_and_merge

# Module-level setup: everything below runs at import time, in this order.
pandarallel.initialize()
check_env_vars()
# NOTE(review): `os` is not among the imports visible above this block --
# confirm it is imported elsewhere in the file.
# The config file location is read from the CONFIG_PATH environment variable.
CONFIG = read_config(config_path=os.getenv('CONFIG_PATH'))

# Neptune is initialised with the project named in the config.
neptune.init(project_qualified_name=CONFIG.project)

# Data locations taken from the config's `data` section.
RAW_DATA_PATH = CONFIG.data.raw_data_path
FEATURES_DATA_PATH = CONFIG.data.features_data_path
# Name of the feature set this module deals with.
FEATURE_NAME = 'v1'
# NOTE(review): presumably a row limit for loading (None = no limit) --
# its use is not visible here; confirm.
NROWS = None


def _split_email(x, colname):
    if type(x) == float and np.isnan(x):
        email_first, email_rest = None, None
    else:
        split = x.split('.')
        email_first = split[0]
Exemple #8
0
#!/usr/bin/env python3

import json
import src.utils
import sys

from src.ircclient import IRCClient
from src.discordclient import DiscordClient
from src import utils

# The first CLI argument is used as the config file only when exactly one
# argument is given; in every other case None is passed to read_config.
# NOTE(review): extra arguments therefore cause the config path to be
# ignored -- confirm that is intended.
config_file = sys.argv[1] if len(sys.argv) == 2 else None
settings = utils.read_config(config_file)

# Set on the shared settings before either client is constructed.
settings['irc']['master_bot'] = True
# Both clients are built from the same settings object.
discord_client = DiscordClient(settings)
irc_client = IRCClient(settings)

# Give each client a reference to the other.
discord_client.set_irc(irc_client)
irc_client.set_discord(discord_client)

# NOTE(review): if irc_client.h_run() blocks, the Discord client is never
# started -- presumably h_run() returns after starting its own thread; confirm.
irc_client.h_run()
discord_client.h_run()