コード例 #1
0
    def __init__(self, threshold=0.5, modelpath=''):
        """Store the settings, optionally load a booster, and build the pipeline.

        Args:
            threshold: Stored unchanged on ``self.threshold``; it is not used
                anywhere else inside this constructor.
            modelpath: Path handed to ``xgb.Booster.load_model``. When it is
                empty (the default) no booster is created and ``self.model``
                stays ``None``.
        """
        self.threshold = threshold
        self.modelpath = modelpath
        self.model = xgb.Booster() if modelpath else None
        if self.model is not None:
            self.model.load_model(modelpath)

        # A single extractor instance is shared by every branch below.
        shared_morph = MorphologyExtractor(sparse=True)

        def column_branch(column, char_width):
            """Build the pipeline for one column: select it, then union the
            ``StringToChar(char_width)`` output with the shared extractor."""
            return Pipeline([
                ('select', ItemSelector(column)),
                ('features', SparseUnion([
                    ('char', StringToChar(char_width, to_coo=True)),
                    ('ctx', shared_morph),
                ])),
            ])

        # Current token uses width 10, its two neighbours use width 5.
        self.pipeline = SparseUnion([
            ('orig', column_branch('before', 10)),
            ('prev', column_branch('prev', 5)),
            ('next', column_branch('next', 5)),
        ])
コード例 #2
0
    def __init__(self, modelpath=''):
        """Optionally load a booster and assemble the feature pipeline.

        Besides the pipeline, the constructor sets ``self.class_type`` (the
        categorical dtype listing the known class labels) and
        ``self.case_extractor``.

        Args:
            modelpath: Path handed to ``xgb.Booster.load_model``. When it is
                empty (the default) no booster is created and ``self.model``
                stays ``None``.
        """
        self.modelpath = modelpath
        self.model = xgb.Booster() if modelpath else None
        if self.model is not None:
            self.model.load_model(modelpath)

        class_labels = [
            'PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS',
            'CARDINAL', 'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL',
            'DIGIT', 'FRACTION', 'MONEY', 'TIME', 'TRANS', 'DASH',
        ]
        self.class_type = CategoricalDtype(categories=class_labels)

        # A single extractor instance is shared by every text branch below.
        shared_morph = MorphologyExtractor(sparse=True, multi_words=True)

        # 'class' column -> category codes -> 2-D -> one-hot with one slot
        # per declared category.
        class_branch = Pipeline([
            ('select', ItemSelector('class')),
            ('codes', ToCategoryCodes(self.class_type)),
            ('reshape', Reshape2d()),
            ('onehot', OneHotEncoder(
                n_values=len(self.class_type.categories),
                sparse=True,
                dtype=np.uint8,
            )),
        ])

        def text_branch(column, char_width):
            """Build the pipeline for one text column: select it, then union
            the ``StringToChar(char_width)`` output with the shared extractor."""
            return Pipeline([
                ('select', ItemSelector(column)),
                ('features', SparseUnion([
                    ('char', StringToChar(char_width, to_coo=True)),
                    ('ctx', shared_morph),
                ])),
            ])

        branches = [
            ('class', class_branch),
            ('orig', text_branch('before', 10)),
        ]
        # Each neighbouring-token column is named after its branch and uses
        # width -5.
        branches.extend(
            (column, text_branch(column, -5))
            for column in ('prev_prev', 'prev', 'next', 'next_next')
        )
        self.pipeline = SparseUnion(branches)

        self.case_extractor = CaseExtractor(multi_words=True)
コード例 #3
0
    def transform(self, X: pd.DataFrame, y=None, *args, **kwargs):
        """Fill the ``after`` column for rows of class ``'TRANS'``.

        Rows are selected when ``before`` matches ``ENG_REGEXP`` and ``class``
        equals ``'TRANS'``. Their lower-cased ``before`` strings are encoded
        with ``StringToChar``, passed through ``vectorize_data`` and
        ``self.model.predict``, and the argmax along axis 2 is mapped through
        ``RUS_CHARS``. Every character of the resulting strings gets a
        ``'_trans'`` suffix and the pieces are joined with single spaces.

        Args:
            X: Frame with at least the ``before`` and ``class`` columns.
            y: Ignored.

        Returns:
            A new frame produced by ``X.assign``. If ``X`` already has an
            ``after`` column, its existing non-null values win and only the
            gaps are filled; otherwise ``after`` holds values for the selected
            rows only.
        """
        looks_english = X['before'].str.match(ENG_REGEXP)
        is_trans_class = X['class'] == 'TRANS'
        trans_ixs = X[looks_english & is_trans_class].index

        lowered = X.loc[trans_ixs, 'before'].str.lower()
        encoded = StringToChar(X_MAX_LEN, to_coo=True).fit_transform(lowered).tocsr()

        scores = self.model.predict(vectorize_data(encoded, ENG_INDEXES))
        best_ixs = np.argmax(scores, axis=2)
        # Intermediates are dropped as soon as they are consumed to keep the
        # peak memory footprint down.
        del encoded, scores

        char_rows = [
            [RUS_CHARS[ix] for ix in row]
            for row in tqdm(best_ixs,
                            f'{self.__class__.__name__} transform stage 1')
        ]
        del best_ixs

        decoded = int_to_str(np.array(char_rows))
        del char_rows

        translit = []
        for text in tqdm(decoded,
                         f'{self.__class__.__name__} transform stage 2'):
            translit.append(' '.join(symbol + '_trans' for symbol in text))
        del decoded

        if 'after' not in X.columns:
            return X.assign(
                after=pd.Series(translit, index=trans_ixs, name='after'))

        filled = X['after'].combine_first(pd.Series(translit, index=trans_ixs))
        return X.assign(after=filled)
コード例 #4
0
    def __init__(self, modelpath=''):
        """Optionally load a booster and assemble the feature pipeline.

        Also sets ``self.class_type``, the categorical dtype listing the known
        class labels; it is not referenced by the pipeline built here.

        Args:
            modelpath: Path handed to ``xgb.Booster.load_model``. When it is
                empty (the default) no booster is created and ``self.model``
                stays ``None``.
        """
        self.modelpath = modelpath
        self.model = xgb.Booster() if modelpath else None
        if self.model is not None:
            self.model.load_model(modelpath)

        class_labels = [
            'PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS',
            'CARDINAL', 'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL',
            'DIGIT', 'FRACTION', 'MONEY', 'TIME', 'TRANS', 'DASH',
        ]
        self.class_type = CategoricalDtype(categories=class_labels)

        # A single extractor instance is shared by every branch below.
        shared_morph = MorphologyExtractor(sparse=True)

        # (branch name, source column, StringToChar width)
        branch_specs = (
            ('orig', 'before', 10),
            ('prev', 'prev', 5),
            ('next', 'next', 5),
        )
        branches = []
        for branch_name, column, char_width in branch_specs:
            selector = ItemSelector(column)
            features = SparseUnion([
                ('char', StringToChar(char_width, to_coo=True)),
                ('ctx', shared_morph),
            ])
            branches.append((branch_name, Pipeline([
                ('select', selector),
                ('features', features),
            ])))
        self.pipeline = SparseUnion(branches)
コード例 #5
0
#                   only_diff=True,
#                   input_path=r'../input/norm_challenge_ru/ru_with_types')\
#      .fillna('')
# Keep only rows whose text actually changed and whose target carries the
# '_trans' marker, then strip the marker and the separating spaces so that
# 'after' becomes a plain string.
df = df[~(df['before'] == df['after']) & (df['after'].str.contains('_trans'))]
df['after'] = df['after'].str.replace('_trans', '').str.replace(' ', '')
df['before'] = df['before'].str.lower()

# Rows containing a literal dot are dropped (reported as "urls" below).
# The pattern is a raw string: '\.' in a normal literal is an invalid escape
# sequence. The mask is computed once and reused for both the count and the
# filter.
has_dot = df['before'].str.contains(r'\.')
print('drop {0} urls from strings'.format(int(has_dot.sum())))
df = df[~has_dot]
del has_dot
# Optional down-sampling, currently disabled:
# df = df.sample(3000000)
print(df.info())

# Fixed encoding widths; the data-driven alternatives are kept for reference.
X_max_len = 32  # min(32, df['before'].str.len().max())
y_max_len = 32  # min(32, df['after'].str.len().max())

X_data = StringToChar(X_max_len,
                      to_coo=True).fit_transform(df['before']).tocsr()
y_data = StringToChar(y_max_len,
                      to_coo=True).fit_transform(df['after']).tocsr()
# The frame is no longer needed once both matrices exist.
del df
gc.collect()

# Vocabularies: index 0 is reserved for the value 0, followed by every
# distinct stored value of the matrix in ascending order.
X_ix_to_char = [0] + sorted(set(X_data.data))
X_char_to_ix = {code: ix for ix, code in enumerate(X_ix_to_char)}
y_ix_to_char = [0] + sorted(set(y_data.data))
y_char_to_ix = {code: ix for ix, code in enumerate(y_ix_to_char)}
print(X_ix_to_char)
print(y_ix_to_char)

X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                    y_data,
                                                    test_size=0.1)
コード例 #6
0
morph_extractor = MorphologyExtractor(sparse=True, multi_words=True)
before_pipeline = SparseUnion([
    ('class',
     Pipeline([('select', ItemSelector('class')),
               ('codes', ToCategoryCodes(class_type)),
               ('reshape', Reshape2d()),
               ('onehot',
                OneHotEncoder(n_values=len(class_type.categories),
                              sparse=True,
                              dtype=np.uint8))])),
    ('orig',
     Pipeline([
         ('select', ItemSelector('before')),
         ('features',
          SparseUnion([
              ('char', StringToChar(10, to_coo=True)),
              ('ctx', morph_extractor),
          ])),
     ])),
    ('prev_prev',
     Pipeline([
         ('select', ItemSelector('prev_prev')),
         ('features',
          SparseUnion([
              ('char', StringToChar(-5, to_coo=True)),
              ('ctx', morph_extractor),
          ])),
     ])),
    ('prev',
     Pipeline([
         ('select', ItemSelector('prev')),