# Regular expression matching a single digit. Written as a raw string so the
# backslash reaches the regex engine verbatim instead of relying on Python
# passing an unrecognised escape sequence through (which emits a warning).
CARDINAL_REGEXP = r'\d'

# Size and training constants; none of them is referenced in the visible part
# of this script.
INPUT_MAX_LEN = 47
OUTPUT_MAX_LEN = 12
INPUT_VOCAB_SIZE = 5000
OUTPUT_VOCAB_SIZE = 257
LAYER_NUM = 2
HIDDEN_DIM = 64
EMBEDDING_DIM = 0
BATCH_SIZE = 128
LEARNING_RATE = 0.001
MEM_SIZE = 10000
NB_EPOCH = 100
DROPOUT = 0.0

df = load_train(['before', 'after', 'class'],
                input_path=r'../input/norm_challenge_ru').fillna('')
# df = load_external(['before', 'after'],
#                    only_diff=True,
#                    input_path=r'../input/norm_challenge_ru/ru_with_types')\
#     .fillna('')

# Context window: the two rows preceding and the two rows following each
# 'before' value. The shifts leave missing values at the frame boundaries;
# they are blanked by the fillna('') applied after filtering below.
df['prev_prev'] = df['before'].shift(2)
df['prev'] = df['before'].shift(1)
df['next'] = df['before'].shift(-1)
df['next_next'] = df['before'].shift(-2)

# Keep only rows whose text differs from its normalised form, then only the
# rows labelled DATE.
df = df[~(df['before'] == df['after'])].fillna('')
df = df[df['class'] == 'DATE']

# Rebuild 'before' as: prev_prev, prev, the token itself split into
# space-separated characters, next, next_next -- all joined by single spaces.
df['before'] = (
    df['prev_prev'].map(str) + ' '
    + df['prev'].map(str) + ' '
    + df['before'].map(lambda s: ' '.join(s)) + ' '
    + df['next'].map(str) + ' '
    + df['next_next'].map(str)
)
del threegramms if 'after' in X.columns: return X.assign( after=X['after'].combine_first(pd.Series(data, index=X.index))) else: return X.assign(after=data) def get_params(self): params = super(self.__class__, self).get_params() params['mean_confidence'] = self.mean_confidence return params if __name__ == '__main__': df = load_train(columns=['before', 'after'], input_path=INPUT_PATH) df['prev'] = df['before'].shift(1) df['next'] = df['before'].shift(-1) df['before'] = df['before'] df['after'] = df['after'] df = df.fillna('') print(df.info()) dt = DictNBHDTransformer(0.5) dt.fit(df.drop(['after'], axis=1), df['after']) dt.fit(df.drop(['after'], axis=1), df['after']) res_df = dt.transform(df.rename(columns={'after': 'actual'})) print('Acc', len(res_df[res_df['after'] == res_df['actual']]) / len(res_df))
import matplotlib.pyplot as plt import numpy as np import xgboost as xgb from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline from loaders.loading import load_train from transformers.item_selector import ItemSelector from transformers.morphology_extractor import MorphologyExtractor from transformers.sparse_union import SparseUnion from transformers.string_to_chars import StringToChar from sparse_helpers import sparse_memory_usage import gc from sklearn.metrics import accuracy_score df = load_train(['before', 'after']).fillna('') df['self'] = (df['before'] == df['after']) df['prev'] = df['before'].shift(1) df['next'] = df['before'].shift(-1) df = df.fillna('') del df['after'] print(df.info()) morph_extractor = MorphologyExtractor(sparse=True) pipeline = SparseUnion([ ('orig', Pipeline([ ('select', ItemSelector('before')), ('features', SparseUnion([ ('char', StringToChar(10, to_coo=True)), ('ctx', morph_extractor),
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, FunctionTransformer from loaders.loading import load_train from transformers.case_extractor import CaseExtractor from transformers.item_selector import ItemSelector, Reshape2d, ToCategoryCodes from transformers.morphology_extractor import MorphologyExtractor from transformers.sparse_union import SparseUnion from transformers.string_to_chars import StringToChar from sparse_helpers import sparse_memory_usage import gc from sklearn.metrics import accuracy_score from pandas.api.types import CategoricalDtype INPUT_PATH = r'../input/norm_challenge_ru' df = load_train(['before', 'after', 'class'], INPUT_PATH).fillna('') #df = load_external(['before', 'after', 'class'], # only_diff=True, # input_path=r'../input/norm_challenge_ru/ru_with_types')\ # .fillna('') df['prev_prev'] = df['before'].shift(2).fillna('') df['prev'] = df['before'].shift(1).fillna('') df['next'] = df['before'].shift(-1).fillna('') df['next_next'] = df['before'].shift(-2).fillna('') classes = frozenset([ 'CARDINAL', 'DATE', 'MEASURE', 'DECIMAL', 'MONEY', 'ORDINAL', 'FRACTION', 'TIME' ]) df = df[~(df['before'] == df['after']) & (df['class'].isin(classes))] class_type = CategoricalDtype(categories=[
# p.tag.number # число (единственное, множественное) res = pd.DataFrame(data, columns=['case', 'gender', 'number']).fillna('none') del data res['case'] = res['case'].astype(self.case_type) res['gender'] = res['gender'].astype(self.gender_type) res['number'] = res['number'].astype(self.number_type) return res if __name__ == '__main__': data = [u'В 1905 году'] + u'съешь ещё этих мягких французских булок , ДА выпей чаю брюки брючные'.split() print(data) morph = CaseExtractor() res = morph.transform(data) print(res.info()) print(res) morph.multi_words = True morph.word_rows = {} res = morph.transform(data) print(res.info()) print(res) df = load_train(columns=['after'], input_path=r'../../input/norm_challenge_ru') morph.word_rows = {} res = morph.transform(df.sample(100000)['after']) print(res.info()) print(res)