Ejemplo n.º 1
0
    def __init__(self, threshold=0.5, modelpath=''):
        """Store the settings, optionally load a booster, build the feature pipeline.

        Args:
            threshold: Kept unchanged on ``self.threshold``; this constructor
                does not use it any further.
            modelpath: Path passed to ``xgb.Booster.load_model``. When it is
                empty (the default) nothing is loaded and ``self.model``
                stays ``None``.
        """
        self.threshold, self.modelpath = threshold, modelpath
        self.model = None
        if self.modelpath:
            self.model = xgb.Booster()
            self.model.load_model(modelpath)

        # A single extractor instance is shared by every branch below.
        shared_morph = MorphologyExtractor(sparse=True)

        # (branch name, column picked by ItemSelector, StringToChar size)
        branch_specs = (
            ('orig', 'before', 10),
            ('prev', 'prev', 5),
            ('next', 'next', 5),
        )
        branches = []
        for branch_name, column, char_size in branch_specs:
            selector = ItemSelector(column)
            token_features = SparseUnion([
                ('char', StringToChar(char_size, to_coo=True)),
                ('ctx', shared_morph),
            ])
            branches.append((branch_name, Pipeline([
                ('select', selector),
                ('features', token_features),
            ])))
        self.pipeline = SparseUnion(branches)
    def __init__(self, modelpath=''):
        """Optionally load a booster and assemble the class + context pipeline.

        Args:
            modelpath: Path passed to ``xgb.Booster.load_model``. When it is
                empty (the default) nothing is loaded and ``self.model``
                stays ``None``.

        Besides ``self.model`` the constructor sets ``self.class_type`` (the
        fixed list of token classes), ``self.pipeline`` (a one-hot encoded
        class branch followed by five token branches) and
        ``self.case_extractor``.
        """
        self.modelpath = modelpath
        self.model = None
        if self.modelpath:
            self.model = xgb.Booster()
            self.model.load_model(modelpath)

        class_names = [
            'PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS',
            'CARDINAL', 'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL',
            'DIGIT', 'FRACTION', 'MONEY', 'TIME', 'TRANS', 'DASH',
        ]
        self.class_type = CategoricalDtype(categories=class_names)

        # A single extractor instance is shared by every token branch below.
        shared_morph = MorphologyExtractor(sparse=True, multi_words=True)

        # Class branch: category codes -> 2-D column -> one-hot matrix sized
        # by the number of known classes.
        # NOTE(review): ``n_values`` is only accepted by older scikit-learn
        # releases - confirm against the pinned version before upgrading.
        class_steps = [
            ('select', ItemSelector('class')),
            ('codes', ToCategoryCodes(self.class_type)),
            ('reshape', Reshape2d()),
            ('onehot', OneHotEncoder(n_values=len(self.class_type.categories),
                                     sparse=True,
                                     dtype=np.uint8)),
        ]
        branches = [('class', Pipeline(class_steps))]

        # (branch name, column picked by ItemSelector, StringToChar size)
        token_specs = (
            ('orig', 'before', 10),
            ('prev_prev', 'prev_prev', -5),
            ('prev', 'prev', -5),
            ('next', 'next', -5),
            ('next_next', 'next_next', -5),
        )
        for branch_name, column, char_size in token_specs:
            selector = ItemSelector(column)
            token_features = SparseUnion([
                ('char', StringToChar(char_size, to_coo=True)),
                ('ctx', shared_morph),
            ])
            branches.append((branch_name, Pipeline([
                ('select', selector),
                ('features', token_features),
            ])))
        self.pipeline = SparseUnion(branches)

        self.case_extractor = CaseExtractor(multi_words=True)
    def __init__(self, modelpath=''):
        """Optionally load a booster and assemble the three-branch pipeline.

        Args:
            modelpath: Path passed to ``xgb.Booster.load_model``. When it is
                empty (the default) nothing is loaded and ``self.model``
                stays ``None``.

        Also sets ``self.class_type`` (the fixed list of token classes) and
        ``self.pipeline`` (features for the current, previous and next token).
        """
        self.modelpath = modelpath
        self.model = None
        if self.modelpath:
            self.model = xgb.Booster()
            self.model.load_model(modelpath)

        class_names = [
            'PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS',
            'CARDINAL', 'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL',
            'DIGIT', 'FRACTION', 'MONEY', 'TIME', 'TRANS', 'DASH',
        ]
        self.class_type = CategoricalDtype(categories=class_names)

        # A single extractor instance is shared by every branch below.
        shared_morph = MorphologyExtractor(sparse=True)

        # (branch name, column picked by ItemSelector, StringToChar size)
        branch_specs = (
            ('orig', 'before', 10),
            ('prev', 'prev', 5),
            ('next', 'next', 5),
        )
        branches = []
        for branch_name, column, char_size in branch_specs:
            selector = ItemSelector(column)
            token_features = SparseUnion([
                ('char', StringToChar(char_size, to_coo=True)),
                ('ctx', shared_morph),
            ])
            branches.append((branch_name, Pipeline([
                ('select', selector),
                ('features', token_features),
            ])))
        self.pipeline = SparseUnion(branches)
            if self.category_columns:
                res = pd.get_dummies(
                    X,
                    sparse=True,
                    dummy_na=False,
                    columns=self.category_columns).to_sparse(fill_value=0)
            else:
                res = pd.get_dummies(X, sparse=True,
                                     dummy_na=False).to_sparse(fill_value=0)
            conv = res.select_dtypes(exclude=[np.number]).astype(np.float16)
            res[conv.columns] = conv
            return res


if __name__ == '__main__':
    # Manual demo: extract morphology for one multi-word phrase plus the
    # individual words of a sentence, then dummy-encode the categorical
    # columns and print both results.
    data = [u'в 1905 году']
    data.extend(
        u'Определение частей речи работает не так как задумывалось'.split())
    print(data)

    morph = MorphologyExtractor().transform(data)
    print(morph.info())
    print(morph.density)

    dummy_columns = [
        'pos', 'animacy', 'aspect', 'case', 'gender', 'involvement', 'mood',
        'number', 'person', 'tense', 'transitivity',
    ]
    encoder = PandasDummies(category_columns=dummy_columns)
    res = encoder.fit_transform(morph)
    print(res)
    print(res.info())
    print(res.density)
Ejemplo n.º 5
0
# Context columns: shift(1) brings in the 'before' value of the previous row,
# shift(-1) / shift(-2) the values one and two rows ahead. Rows at the edges
# have no such neighbour, so the resulting NaN is replaced by ''.
df['prev'] = df['before'].shift(1).fillna('')
df['next'] = df['before'].shift(-1).fillna('')
df['next_next'] = df['before'].shift(-2).fillna('')
# Classes that survive the row filter below.
classes = frozenset([
    'CARDINAL', 'DATE', 'MEASURE', 'DECIMAL', 'MONEY', 'ORDINAL', 'FRACTION',
    'TIME'
])
# Keep only rows whose text changed (before != after) AND whose class is in
# `classes`. The context columns were computed before this filter, so they
# still refer to neighbours in the unfiltered frame.
df = df[~(df['before'] == df['after']) & (df['class'].isin(classes))]
# Fixed category list for the 'class' column; it is passed to ToCategoryCodes
# and its length sizes the one-hot encoder in the pipeline below.
class_type = CategoricalDtype(categories=[
    'PLAIN', 'DATE', 'PUNCT', 'ORDINAL', 'VERBATIM', 'LETTERS', 'CARDINAL',
    'MEASURE', 'TELEPHONE', 'ELECTRONIC', 'DECIMAL', 'DIGIT', 'FRACTION',
    'MONEY', 'TIME', 'TRANS', 'DASH'
])
# NOTE(review): DataFrame.info() writes its report itself and returns None,
# so this line additionally prints "None".
print(df.info())

# Morphology feature extractor, configured for sparse output and multi-word
# input via its constructor flags.
morph_extractor = MorphologyExtractor(sparse=True, multi_words=True)
before_pipeline = SparseUnion([
    ('class',
     Pipeline([('select', ItemSelector('class')),
               ('codes', ToCategoryCodes(class_type)),
               ('reshape', Reshape2d()),
               ('onehot',
                OneHotEncoder(n_values=len(class_type.categories),
                              sparse=True,
                              dtype=np.uint8))])),
    ('orig',
     Pipeline([
         ('select', ItemSelector('before')),
         ('features',
          SparseUnion([
              ('char', StringToChar(10, to_coo=True)),
from transformers.morphology_extractor import MorphologyExtractor
from transformers.sparse_union import SparseUnion
from transformers.string_to_chars import StringToChar
from sparse_helpers import sparse_memory_usage
import gc
from sklearn.metrics import accuracy_score

# load_train is defined outside this excerpt; presumably it returns a
# DataFrame with the requested columns - confirm against its definition.
# Missing values are replaced by '' right after loading.
df = load_train(['before', 'after']).fillna('')
# True where the token is left unchanged (before == after).
df['self'] = (df['before'] == df['after'])
# Neighbouring tokens: previous row and next row of 'before'. The NaN that
# shift() leaves at the first/last row is filled with '' just below.
df['prev'] = df['before'].shift(1)
df['next'] = df['before'].shift(-1)
df = df.fillna('')
# 'after' is no longer needed once 'self' has been derived from it.
del df['after']
# NOTE(review): DataFrame.info() writes its report itself and returns None,
# so this line additionally prints "None".
print(df.info())

# Morphology feature extractor, configured for sparse output.
morph_extractor = MorphologyExtractor(sparse=True)
pipeline = SparseUnion([
    ('orig',
     Pipeline([
         ('select', ItemSelector('before')),
         ('features',
          SparseUnion([
              ('char', StringToChar(10, to_coo=True)),
              ('ctx', morph_extractor),
          ])),
     ])),
    ('prev',
     Pipeline([
         ('select', ItemSelector('prev')),
         ('features',
          SparseUnion([
Ejemplo n.º 7
0
            for col in tqdm(self.columns,
                            f'{self.__class__.__name__} transform'):
                res.append(LabelEncoder().fit_transform(X[col]))
        # else:
        #     if isinstance(X, np.ndarray):
        #         for col in X.T:
        #             res(LabelEncoder().fit_transform(np.array(col)))
        return pd.DataFrame(np.column_stack(res), columns=self.columns)


if __name__ == '__main__':
    # Demo 1: encode the two object-dtype fields of a structured NumPy array.
    records = [('s', 'dfg'), ('f', 's'), ('H', 'h')]
    field_types = [('col1', 'O'), ('col2', 'O')]
    np_array1d = np.array(records, dtype=field_types)
    print(np_array1d)
    res = MultiLabelEncoder(['col1', 'col2']).transform(np_array1d)
    print(res, flush=True)

    # Demo 2: encode the columns produced by MorphologyExtractor for one
    # multi-word phrase plus the individual words of a sentence.
    from transformers.morphology_extractor import MorphologyExtractor
    data = [u'в 1905 году']
    data += (
        u'Определение частей речи работает не так как задумывалось в ПП'
        .split())
    print(data, flush=True)
    context = MorphologyExtractor().transform(data)
    print(context, flush=True)

    morph_columns = (
        'is_first_upper', 'is_upper',
        'pos', 'animacy', 'aspect', 'case', 'gender', 'mood', 'number',
        'person', 'tense', 'transitivity', 'voice',
    )
    res = MultiLabelEncoder(morph_columns).transform(context)
    print(res)