def __init__(self, **kwargs):
    """
    Build the node and edge feature transformers.

    Sets three attributes on the instance:
      - _node_transformer: TransformerListByType over
        [block FeatureUnion, GridLine_NodeTransformer]
      - _edge_transformer: TransformerListByType over four edge
        transformers (block-block, block-line, block-line again, line-line)
      - tfidfNodeTextVectorizer: initialised to None
    """
    FeatureDefinition.__init__(self)

    # The returned type count is not used in this constructor; the call is
    # kept because the original performed it (it may validate kwargs --
    # TODO confirm against _getTypeNumber).
    self._getTypeNumber(kwargs)

    def _quantile_scaler():
        # In-place quantile scaling (copy=False).
        # History: v1 used StandardScaler(copy=False, with_mean=True,
        # with_std=True) at each place this scaler is used.
        return QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)

    # ---- node features ---------------------------------------------------
    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    # (step names and their order are relied upon there)
    xywh_pipe = Pipeline([
        ('selector', NodeTransformerXYWH_v2()),
        ('xywh', _quantile_scaler()),
    ])
    neighbors_pipe = Pipeline([
        ('selector', NodeTransformerNeighbors()),
        ('neighbors', _quantile_scaler()),
    ])
    node_1hot_pipe = Pipeline([
        ('1hot', Node1HotFeatures()),  # does the 1-hot encoding directly
    ])
    block_features = FeatureUnion([
        ("xywh", xywh_pipe),
        ("neighbors", neighbors_pipe),
        ("1hot", node_1hot_pipe),
    ])
    grid_line_features = GridLine_NodeTransformer()

    self._node_transformer = TransformerListByType(
        [block_features, grid_line_features]
    )

    # ---- edge features ---------------------------------------------------
    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    edge_1hot_pipe = Pipeline([
        ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality())),
    ])
    edge_bool_pipe = Pipeline([
        ('boolean', EdgeBooleanFeatures_v2()),
    ])
    edge_num_pipe = Pipeline([
        ('selector', EdgeNumericalSelector()),
        ('numerical', _quantile_scaler()),
    ])
    block_block_edges = FeatureUnion([
        ("1hot", edge_1hot_pipe),
        ("boolean", edge_bool_pipe),
        ("numerical", edge_num_pipe),
    ])
    block_line_edges = Block2GridLine_EdgeTransformer()
    line_line_edges = GridLine2GridLine_EdgeTransformer()

    # The same block-to-line transformer instance fills two consecutive
    # slots; the second is useless but required (presumably the
    # line-to-block slot of a 2x2 type grid -- TODO confirm).
    edge_transformers = [
        block_block_edges,
        block_line_edges,
        block_line_edges,  # useless but required
        line_line_edges,
    ]
    self._edge_transformer = TransformerListByType(edge_transformers)

    # No text vectorizer is configured by this constructor.
    self.tfidfNodeTextVectorizer = None