def __init__(self, threshold=0.5, modelpath=''):
    """Store the settings, optionally load a booster, and build the feature union.

    Args:
        threshold: Kept on ``self.threshold``; it is not otherwise used here.
        modelpath: File handed to ``xgb.Booster.load_model``. When empty,
            no booster is created and ``self.model`` stays ``None``.
    """
    self.threshold = threshold
    self.modelpath = modelpath

    # A booster is only instantiated when a model file was supplied.
    self.model = None
    if modelpath:
        self.model = xgb.Booster()
        self.model.load_model(self.modelpath)

    # A single extractor instance is shared by every branch below.
    morph_extractor = MorphologyExtractor(sparse=True)

    def token_branch(column, char_arg):
        # Select one column, then stack its char encoding with the
        # morphology features.
        selector = ItemSelector(column)
        features = SparseUnion([
            ('char', StringToChar(char_arg, to_coo=True)),
            ('ctx', morph_extractor),
        ])
        return Pipeline([('select', selector), ('features', features)])

    # (branch name, source column, first StringToChar argument)
    layout = (
        ('orig', 'before', 10),
        ('prev', 'prev', 5),
        ('next', 'next', 5),
    )
    self.pipeline = SparseUnion([
        (name, token_branch(column, char_arg))
        for name, column, char_arg in layout
    ])
def __init__(self, modelpath=''):
    """Optionally load a booster and assemble the class + context feature union.

    Args:
        modelpath: File handed to ``xgb.Booster.load_model``. When empty,
            no booster is created and ``self.model`` stays ``None``.
    """
    self.modelpath = modelpath

    # A booster is only instantiated when a model file was supplied.
    self.model = None
    if modelpath:
        self.model = xgb.Booster()
        self.model.load_model(self.modelpath)

    # Fixed category order used to turn the 'class' column into codes.
    class_names = [
        'PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS',
        'CARDINAL', 'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL',
        'DIGIT', 'FRACTION', 'MONEY', 'TIME', 'TRANS', 'DASH',
    ]
    self.class_type = CategoricalDtype(categories=class_names)

    # A single extractor instance is shared by every context branch below.
    morph_extractor = MorphologyExtractor(sparse=True, multi_words=True)

    # 'class' column -> category codes -> 2-d -> one-hot.
    # NOTE(review): `n_values` / `sparse` look like keyword names from older
    # scikit-learn releases - confirm against the pinned version.
    n_classes = len(self.class_type.categories)
    class_branch = Pipeline([
        ('select', ItemSelector('class')),
        ('codes', ToCategoryCodes(self.class_type)),
        ('reshape', Reshape2d()),
        ('onehot', OneHotEncoder(n_values=n_classes,
                                 sparse=True,
                                 dtype=np.uint8)),
    ])

    def token_branch(column, char_arg):
        # Select one column, then stack its char encoding with the
        # morphology features.
        selector = ItemSelector(column)
        features = SparseUnion([
            ('char', StringToChar(char_arg, to_coo=True)),
            ('ctx', morph_extractor),
        ])
        return Pipeline([('select', selector), ('features', features)])

    # (branch name, source column, first StringToChar argument)
    layout = (
        ('orig', 'before', 10),
        ('prev_prev', 'prev_prev', -5),
        ('prev', 'prev', -5),
        ('next', 'next', -5),
        ('next_next', 'next_next', -5),
    )
    branches = [('class', class_branch)]
    branches.extend(
        (name, token_branch(column, char_arg))
        for name, column, char_arg in layout
    )
    self.pipeline = SparseUnion(branches)

    self.case_extractor = CaseExtractor(multi_words=True)
def transform(self, X: pd.DataFrame, y=None, *args, **kwargs):
    """Fill ``after`` for TRANS rows with a model-predicted transliteration.

    Rows are selected when ``before`` matches ``ENG_REGEXP`` and ``class``
    equals ``'TRANS'``. Their lower-cased ``before`` strings are encoded
    with ``StringToChar``, passed through ``vectorize_data`` and
    ``self.model.predict``, arg-maxed over axis 2, mapped through
    ``RUS_CHARS`` and decoded with ``int_to_str``. Every element of each
    decoded string gets a ``_trans`` suffix and the pieces are joined with
    single spaces.

    Args:
        X: Frame with at least the ``before`` and ``class`` columns.
        y: Unused.
        *args: Unused.
        **kwargs: Unused.

    Returns:
        A new frame (``X.assign``). If ``X`` already has an ``after``
        column, its existing values win and only gaps are filled
        (``combine_first``); otherwise ``after`` is created and holds
        values only on the selected rows.
    """
    is_trans = (X['before'].str.match(ENG_REGEXP)) & (X['class'] == 'TRANS')
    trans_ixs = X[is_trans].index

    translit = []
    # Skip the encoder/model round-trip entirely when nothing was selected;
    # the tail below then behaves as if the model had produced no rows.
    # NOTE(review): assumes the model/encoder cannot be relied on to accept
    # a zero-row batch - confirm.
    if len(trans_ixs) > 0:
        X_data = StringToChar(X_MAX_LEN, to_coo=True).fit_transform(
            X.loc[trans_ixs, 'before'].str.lower()).tocsr()
        predictions = np.argmax(
            self.model.predict(vectorize_data(X_data, ENG_INDEXES)), axis=2)
        del X_data  # release intermediates early to keep peak memory down

        sequences = [
            [RUS_CHARS[ix] for ix in prediction]
            for prediction in tqdm(
                predictions,
                f'{self.__class__.__name__} transform stage 1')
        ]
        del predictions

        strs_predict = int_to_str(np.array(sequences))
        del sequences

        # `word` instead of `str`: do not shadow the builtin.
        translit = [
            ' '.join([c + '_trans' for c in word])
            for word in tqdm(
                strs_predict,
                f'{self.__class__.__name__} transform stage 2')
        ]
        del strs_predict

    if 'after' in X.columns:
        return X.assign(after=X['after'].combine_first(
            pd.Series(translit, index=trans_ixs)))
    return X.assign(
        after=pd.Series(translit, index=trans_ixs, name='after'))
def __init__(self, modelpath=''):
    """Optionally load a booster, define the class dtype and build the feature union.

    Args:
        modelpath: File handed to ``xgb.Booster.load_model``. When empty,
            no booster is created and ``self.model`` stays ``None``.
    """
    self.modelpath = modelpath

    # A booster is only instantiated when a model file was supplied.
    self.model = None
    if modelpath:
        self.model = xgb.Booster()
        self.model.load_model(self.modelpath)

    # Fixed category order; stored on the instance, not used by the
    # pipeline built in this constructor.
    class_names = [
        'PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS',
        'CARDINAL', 'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL',
        'DIGIT', 'FRACTION', 'MONEY', 'TIME', 'TRANS', 'DASH',
    ]
    self.class_type = CategoricalDtype(categories=class_names)

    # A single extractor instance is shared by every branch below.
    morph_extractor = MorphologyExtractor(sparse=True)

    def token_branch(column, char_arg):
        # Select one column, then stack its char encoding with the
        # morphology features.
        selector = ItemSelector(column)
        features = SparseUnion([
            ('char', StringToChar(char_arg, to_coo=True)),
            ('ctx', morph_extractor),
        ])
        return Pipeline([('select', selector), ('features', features)])

    # (branch name, source column, first StringToChar argument)
    layout = (
        ('orig', 'before', 10),
        ('prev', 'prev', 5),
        ('next', 'next', 5),
    )
    self.pipeline = SparseUnion([
        (name, token_branch(column, char_arg))
        for name, column, char_arg in layout
    ])
# NOTE(review): the three comment lines below appear to be the tail of a
# commented-out loader call whose start lies above this fragment.
#                                 only_diff=True,
#                                 input_path=r'../input/norm_challenge_ru/ru_with_types')\
#     .fillna('')

# Keep only rows that were changed and whose target carries the `_trans`
# marker, then strip the marker and the spaces from the target.
df = df[~(df['before'] == df['after']) & (df['after'].str.contains('_trans'))]
df['after'] = df['after'].str.replace('_trans', '').str.replace(' ', '')
df['before'] = df['before'].str.lower()

# Rows whose source contains a dot are treated as URLs and dropped.
# Plain substring search: avoids the invalid '\.' escape of the non-raw
# literal, and the mask is computed once instead of twice.
has_dot = df['before'].str.contains('.', regex=False)
print('drop {0} urls from strings'.format(int(has_dot.sum())))
df = df[~has_dot]
#df = df.sample(3000000)

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
df.info()

X_max_len = 32  #min(32, df['before'].str.len().max())
y_max_len = 32  #min(32, df['after'].str.len().max())
X_data = StringToChar(X_max_len, to_coo=True).fit_transform(df['before']).tocsr()
y_data = StringToChar(y_max_len, to_coo=True).fit_transform(df['after']).tocsr()
del df
gc.collect()

# Vocabularies: 0 first (presumably the padding code - confirm), followed by
# the distinct stored values of each sparse matrix in ascending order.
# `code` instead of `chr`: do not shadow the builtin.
X_ix_to_char = [0] + sorted(set(X_data.data))
X_char_to_ix = {code: ix for ix, code in enumerate(X_ix_to_char)}
y_ix_to_char = [0] + sorted(set(y_data.data))
y_char_to_ix = {code: ix for ix, code in enumerate(y_ix_to_char)}
print(X_ix_to_char)
print(y_ix_to_char)

# Hold out 10% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.1)
morph_extractor = MorphologyExtractor(sparse=True, multi_words=True) before_pipeline = SparseUnion([ ('class', Pipeline([('select', ItemSelector('class')), ('codes', ToCategoryCodes(class_type)), ('reshape', Reshape2d()), ('onehot', OneHotEncoder(n_values=len(class_type.categories), sparse=True, dtype=np.uint8))])), ('orig', Pipeline([ ('select', ItemSelector('before')), ('features', SparseUnion([ ('char', StringToChar(10, to_coo=True)), ('ctx', morph_extractor), ])), ])), ('prev_prev', Pipeline([ ('select', ItemSelector('prev_prev')), ('features', SparseUnion([ ('char', StringToChar(-5, to_coo=True)), ('ctx', morph_extractor), ])), ])), ('prev', Pipeline([ ('select', ItemSelector('prev')),