Ejemplo n.º 1
0
class NikkeiFastText:
    """Provide a FastText model that is built once, saved, and then reused.

    ``load_model`` is the entry point: it hands back the model held in
    ``cache_model`` when there is one, otherwise loads the file at
    ``path_to_model`` if it exists, otherwise builds a new model with
    ``create_fast_model`` and saves it to ``path_to_model``.
    """
    # Location of the saved model inside CACHE_DIR.
    path_to_model = os.path.join(CACHE_DIR, 'fasttext.model')
    logger = get_logger('kaggle-days.fasttext', log_level='INFO')
    # Model kept on the class after the first successful ``load_model`` call.
    cache_model = None

    @classmethod
    def load_model(cls):
        """Return the FastText model, building and saving it when necessary.

        Returns:
            The model stored in ``cls.cache_model`` if set; otherwise the
            model loaded from ``cls.path_to_model`` or freshly created by
            ``cls.create_fast_model`` (and then stored in ``cls.cache_model``).
        """
        cached = cls.cache_model
        if cached is not None:
            cls.logger.info('return from cache')
            return cached

        if not os.path.exists(cls.path_to_model):
            # Nothing saved yet: build the model (timed) and save it for later runs.
            with timer(cls.logger, format_str='create fasttext: ' + '{:.3f}'):
                fasttext_model = cls.create_fast_model()
            fasttext_model.save(cls.path_to_model)
        else:
            cls.logger.info('model already created. load from disk.')
            fasttext_model = FastText.load(cls.path_to_model)

        cls.cache_model = fasttext_model
        return fasttext_model

    @classmethod
    def create_fast_model(cls):
        """Build a FastText model from the text columns returned by ``read_kiji``.

        The 'title', 'title2', 'title3' and 'body' columns are concatenated
        (missing values replaced by ''), each resulting string is passed
        through ``normalize_neologd`` and then ``DocumentParser.call``, and
        the parsed documents are given to ``FastText``.

        Returns:
            The ``FastText`` instance created from the parsed documents.
        """
        # NOTE(review): presumably a pandas DataFrame -- confirm against read_kiji.
        articles = read_kiji()

        # Concatenate the columns left to right, with no separator between them.
        text_columns = ('title', 'title2', 'title3', 'body')
        joined = articles[text_columns[0]].fillna('')
        for column in text_columns[1:]:
            joined = joined + articles[column].fillna('')

        normalized_docs = [normalize_neologd(raw) for raw in joined.values]
        tokenizer = DocumentParser()
        tokenized_docs = [tokenizer.call(doc) for doc in normalized_docs]
        return FastText(tokenized_docs, size=128, workers=6, iter=10)
Ejemplo n.º 2
0
 def __init__(self,
              to=None,
              keys: Union[List[str], None] = None,
              logger=None,
              datafame_backend=None):
     """Store the backend configuration on the instance.

     Args:
         to: Stored unchanged as ``self.to``.
         keys: Stored as ``self.keys``; an empty list is used when None.
         logger: Logger stored as ``self.logger``. When None, a logger is
             obtained from ``get_logger(__name__)``.
         datafame_backend: Stored as ``self.dataframe_backend``. When None,
             the result of ``get_dataframe_backend()`` is used. (The
             parameter name is misspelled; it is kept so existing keyword
             callers keep working.)
     """
     self.to = to
     self.keys = [] if keys is None else keys
     # Use the caller's logger when one is given; the argument used to be
     # accepted but silently ignored.
     self.logger = get_logger(__name__) if logger is None else logger
     self.dataframe_backend = get_dataframe_backend() if datafame_backend is None else datafame_backend
Ejemplo n.º 3
0
    def __init__(self, to=None, mark_filename='metrics.json', **kwargs):
        """Set up the backend, its output directory and its logger.

        Args:
            to: Output location. When not None it is converted to an
                absolute path before being passed to the parent initializer.
            mark_filename: Stored as ``self.mark_filename``.
            **kwargs: Forwarded unchanged to the parent initializer.
        """
        resolved_to = to if to is None else os.path.abspath(to)
        super(LocalExperimentBackend, self).__init__(resolved_to, **kwargs)

        self.mark_filename = mark_filename

        # Create the output directory up front, but only when saving is enabled.
        if self.can_save:
            os.makedirs(self.output_dir, exist_ok=True)

        # Replace the logger set by the parent with one named after this
        # experiment and given self.logging_path as its output file.
        logger_name = get_logger_name(resolved_to, keys=self.keys)
        log_format = '%(name)-30s: %(levelname)-8s %(message)s'
        self.logger = get_logger(name=logger_name,
                                 output_file=self.logging_path,
                                 format_str=log_format)

        output_message = 'experiment output is {}'.format(self.output_dir)
        self.logger.debug(output_message)
        name_message = 'logger name: {}'.format(logger_name)
        self.logger.debug(name_message)
Ejemplo n.º 4
0
from sklearn.metrics import check_scoring
from sklearn.metrics._scorer import _BaseScorer, SCORERS
from sklearn.model_selection import check_cv
from sklearn.preprocessing import LabelEncoder

from vivid.backends.experiments import ExperimentBackend
from vivid.core import AbstractEvaluation, BaseBlock, SimpleEvaluation
from vivid.sklearn_extend import PrePostProcessModel
from vivid.utils import get_logger
from .evaluations import FeatureImportanceReport, MetricReport, curve_figure_reports
from .evaluations import ConfusionMatrixReport

from vivid.metrics import binary_metrics, multiclass_metrics, regression_metrics
from .utils import to_pretty_lines

# Module-level logger, named after this module.
logger = get_logger(__name__)


class EstimatorMixin:
    """Mixin that sets the class attribute ``is_estimator`` to ``True``.

    NOTE(review): presumably read elsewhere to tell estimator blocks apart
    from other blocks -- confirm against the code that checks this flag.
    """
    # Inherited by every class that lists this mixin among its bases.
    is_estimator = True


def _get_default_model_evaluations(
        evaluations: Union[None, List[AbstractEvaluation]]):
    if isinstance(evaluations, list):
        return evaluations
    return [
        FeatureImportanceReport(),
        MetricReport(), *curve_figure_reports(),
        SimpleEvaluation(),
        ConfusionMatrixReport()
Ejemplo n.º 5
0
import joblib
import numpy as np
import pandas as pd
from anemone.embedding import SWEM
from anemone.preprocess import normalize_neologd
from anemone.preprocess.wakati import DocumentParser
from gensim.models import FastText
from sklearn.decomposition import PCA
from vivid.featureset import AbstractMergeAtom
from vivid.utils import timer, get_logger

from kaggle_days.dataset import read_data
from kaggle_days.dataset import read_kiji
from kaggle_days.env import CACHE_DIR

# Module-level logger named 'kaggle-days.kiji', requested with log_level='INFO'.
logger = get_logger('kaggle-days.kiji', log_level='INFO')


def safe_normalize(d):
    """Normalize ``d`` with ``normalize_neologd``, returning None on failure.

    Args:
        d: Value passed straight to ``normalize_neologd``.

    Returns:
        The result of ``normalize_neologd(d)``, or None if that call raised
        an ordinary exception.
    """
    try:
        return normalize_neologd(d)
    except Exception:
        # Best-effort on purpose: a value that cannot be normalized becomes
        # None. Catching ``Exception`` rather than using a bare ``except``
        # lets KeyboardInterrupt and SystemExit propagate.
        return None


class NikkeiFastText:
    path_to_model = os.path.join(CACHE_DIR, 'fasttext.model')
    logger = get_logger('kaggle-days.fasttext', log_level='INFO')
    cache_model = None

    @classmethod