class NikkeiFastText:
    """FastText model trained on Nikkei article text, cached on disk and in memory."""

    path_to_model = os.path.join(CACHE_DIR, 'fasttext.model')
    logger = get_logger('kaggle-days.fasttext', log_level='INFO')
    cache_model = None

    @classmethod
    def load_model(cls):
        # prefer the in-memory cache, then the on-disk model,
        # and only train from scratch when neither exists
        if cls.cache_model is not None:
            cls.logger.info('return from cache')
            return cls.cache_model

        if os.path.exists(cls.path_to_model):
            cls.logger.info('model already created. load from disk.')
            model = FastText.load(cls.path_to_model)
        else:
            with timer(cls.logger, format_str='create fasttext: ' + '{:.3f}'):
                model = cls.create_fast_model()
            model.save(cls.path_to_model)

        cls.cache_model = model
        return model

    @classmethod
    def create_fast_model(cls):
        # concatenate title / sub-title / body columns into one document per article
        kiji_df = read_kiji()
        main_text = (kiji_df['title'].fillna('')
                     + kiji_df['title2'].fillna('')
                     + kiji_df['title3'].fillna('')
                     + kiji_df['body'].fillna(''))
        main_text = main_text.values
        main_text = [normalize_neologd(d) for d in main_text]

        parser = DocumentParser()
        parsed_docs = [parser.call(d) for d in main_text]
        # NOTE: `size` and `iter` are the gensim < 4.0 argument names
        # (gensim >= 4.0 renamed them to `vector_size` and `epochs`)
        model = FastText(parsed_docs, size=128, workers=6, iter=10)
        return model
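
# Usage sketch (not part of the original file): the class above lazily trains,
# saves, and memoizes a single FastText model, so callers only need load_model().
# The query token '日経' is just an illustrative example word.
model = NikkeiFastText.load_model()
vector = model.wv['日経']  # 128-dim vector via gensim's KeyedVectors interface
print(vector.shape)        # -> (128,)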

def __init__(self, to=None, keys: Union[List[str], None] = None, logger=None,
             dataframe_backend=None):
    self.to = to
    self.keys = [] if keys is None else keys
    self.logger = get_logger(__name__) if logger is None else logger
    self.dataframe_backend = (get_dataframe_backend()
                              if dataframe_backend is None else dataframe_backend)

def __init__(self, to=None, mark_filename='metrics.json', **kwargs):
    if to is not None:
        to = os.path.abspath(to)
    super(LocalExperimentBackend, self).__init__(to, **kwargs)
    self.mark_filename = mark_filename

    if self.can_save:
        os.makedirs(self.output_dir, exist_ok=True)

    logger_name = get_logger_name(to, keys=self.keys)
    self.logger = get_logger(name=logger_name,
                             output_file=self.logging_path,
                             format_str='%(name)-30s: %(levelname)-8s %(message)s')
    self.logger.debug('experiment output is {}'.format(self.output_dir))
    self.logger.debug('logger name: {}'.format(logger_name))
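
# Usage sketch (assumed from the two constructors above; the directory path and
# keys are made-up example values): pointing the backend at a local directory
# prepares the output folder and a file logger bound to it.
backend = LocalExperimentBackend(to='./outputs/exp001', keys=['fold_0'])
backend.logger.info('experiment directory prepared')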

from typing import List, Union

from sklearn.metrics import check_scoring
from sklearn.metrics._scorer import _BaseScorer, SCORERS
from sklearn.model_selection import check_cv
from sklearn.preprocessing import LabelEncoder

from vivid.backends.experiments import ExperimentBackend
from vivid.core import AbstractEvaluation, BaseBlock, SimpleEvaluation
from vivid.metrics import binary_metrics, multiclass_metrics, regression_metrics
from vivid.sklearn_extend import PrePostProcessModel
from vivid.utils import get_logger

from .evaluations import ConfusionMatrixReport
from .evaluations import FeatureImportanceReport, MetricReport, curve_figure_reports
from .utils import to_pretty_lines

logger = get_logger(__name__)


class EstimatorMixin:
    is_estimator = True


def _get_default_model_evaluations(
        evaluations: Union[None, List[AbstractEvaluation]]):
    # an explicitly supplied list of evaluations is used as-is;
    # anything else falls back to the default report set
    if isinstance(evaluations, list):
        return evaluations
    return [
        FeatureImportanceReport(),
        MetricReport(),
        *curve_figure_reports(),
        SimpleEvaluation(),
        ConfusionMatrixReport(),
    ]
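
# Usage sketch: a caller-provided list of evaluations is returned unchanged,
# while None (or any non-list) yields the default report set built above.
default_reports = _get_default_model_evaluations(None)             # default set
custom_reports = _get_default_model_evaluations([MetricReport()])  # used as-is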

import os

import joblib
import numpy as np
import pandas as pd
from anemone.embedding import SWEM
from anemone.preprocess import normalize_neologd
from anemone.preprocess.wakati import DocumentParser
from gensim.models import FastText
from sklearn.decomposition import PCA
from vivid.featureset import AbstractMergeAtom
from vivid.utils import timer, get_logger

from kaggle_days.dataset import read_data
from kaggle_days.dataset import read_kiji
from kaggle_days.env import CACHE_DIR

logger = get_logger('kaggle-days.kiji', log_level='INFO')


def safe_normalize(d):
    # raw article text occasionally breaks NEologd normalization;
    # return None instead of raising so callers can filter failures out
    try:
        return normalize_neologd(d)
    except Exception:
        return None
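
# Usage sketch (the column name 'body' mirrors the article schema used above):
# safe_normalize lets a whole text column be cleaned in one pass, with rows
# that failed normalization dropped afterwards.
texts = read_kiji()['body'].fillna('').values
normalized = [safe_normalize(t) for t in texts]
normalized = [t for t in normalized if t is not None]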