def __init__(self, **kwargs):
    """
    Build the node and edge feature transformers and store them on the instance.

    Sets self._node_transformer, self._edge_transformer and
    self.tfidfNodeTextVectorizer (the latter is set to None).

    kwargs is only passed to self._getTypeNumber; it is not forwarded to
    FeatureDefinition.__init__.
    """
    FeatureDefinition.__init__(self)

    # NOTE: computed here but not used anywhere below in this method.
    nbTypes = self._getTypeNumber(kwargs)

    print("BETTER FEATURES")

    # ------------------------------------------------------------------
    # Node features
    # ------------------------------------------------------------------
    # The v1 variant scaled with
    #   StandardScaler(copy=False, with_mean=True, with_std=True)
    # where QuantileTransformer is used now. copy=False asks for in-place
    # scaling.
    xywh_pipeline = Pipeline([
        ('selector', NodeTransformerXYWH_v2()),
        ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
    ])
    neighbors_pipeline = Pipeline([
        ('selector', NodeTransformerNeighbors()),
        ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
    ])
    node_1hot_pipeline = Pipeline([
        ('1hot', Node1HotFeatures()),  # does the 1-hot encoding directly
    ])

    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    block_node_features = FeatureUnion([
        ("xywh", xywh_pipeline),
        ("neighbors", neighbors_pipeline),
        ("1hot", node_1hot_pipeline),
    ])
    grid_line_node_features = GridLine_NodeTransformer_v2()

    # One node transformer per node type: blocks first, then grid lines.
    self._node_transformer = TransformerListByType(
        [block_node_features, grid_line_node_features]
    )

    # ------------------------------------------------------------------
    # Edge features
    # ------------------------------------------------------------------
    edge_1hot_pipeline = Pipeline([
        ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality())),
    ])
    edge_boolean_pipeline = Pipeline([
        ('boolean', EdgeBooleanFeatures_v2()),
    ])
    # Same v1 -> QuantileTransformer replacement as for the node pipelines.
    edge_numerical_pipeline = Pipeline([
        ('selector', EdgeNumericalSelector()),
        ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
    ])

    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    block_to_block_edges = FeatureUnion([
        ("1hot", edge_1hot_pipeline),
        ("boolean", edge_boolean_pipeline),
        ("numerical", edge_numerical_pipeline),
    ])
    block_to_line_edges = Block2GridLine_EdgeTransformer()
    line_to_line_edges = GridLine2GridLine_EdgeTransformer()

    # Four slots are required. The same block-to-grid-line transformer
    # object is deliberately used for both middle slots.
    edge_features_by_type = [
        block_to_block_edges,
        block_to_line_edges,
        block_to_line_edges,  # useless but required
        line_to_line_edges,
    ]
    self._edge_transformer = TransformerListByType(edge_features_by_type)

    # No text vectorizer in this feature definition.
    self.tfidfNodeTextVectorizer = None
def __init__(self, **kwargs):
    """
    Build one node FeatureUnion per type and one edge FeatureUnion per
    ordered pair of types, and store them on the instance.

    The number of types is obtained from self._getTypeNumber(kwargs).
    Sets self._node_transformer and self._edge_transformer, each wrapped in
    a TransformerListByType. kwargs is forwarded to FeatureDefinition.__init__.
    """
    FeatureDefinition.__init__(self, **kwargs)

    type_count = self._getTypeNumber(kwargs)

    def make_node_features():
        # Fresh, independent node FeatureUnion on every call.
        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        # The v1 variant scaled with
        #   StandardScaler(copy=False, with_mean=True, with_std=True)
        # where QuantileTransformer is used now. copy=False asks for
        # in-place scaling.
        xywh_pipeline = Pipeline([
            ('selector', NodeTransformerXYWH_v2()),
            ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
        ])
        neighbors_pipeline = Pipeline([
            ('selector', NodeTransformerNeighbors()),
            ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
        ])
        one_hot_pipeline = Pipeline([
            ('1hot', Node1HotFeatures()),  # does the 1-hot encoding directly
        ])
        return FeatureUnion([
            ("xywh", xywh_pipeline),
            ("neighbors", neighbors_pipeline),
            ("1hot", one_hot_pipeline),
        ])

    def make_edge_features():
        # Fresh, independent edge FeatureUnion on every call.
        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        one_hot_pipeline = Pipeline([
            ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality())),
        ])
        boolean_pipeline = Pipeline([
            ('boolean', EdgeBooleanFeatures_v2()),
        ])
        # Same v1 -> QuantileTransformer replacement as for the node pipelines.
        numerical_pipeline = Pipeline([
            ('selector', EdgeNumericalSelector()),
            ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
        ])
        return FeatureUnion([
            ("1hot", one_hot_pipeline),
            ("boolean", boolean_pipeline),
            ("numerical", numerical_pipeline),
        ])

    # One node transformer per type.
    per_type_node_features = TransformerListByType(
        [make_node_features() for _ in range(type_count)]
    )

    # One edge transformer per (type, type) pair.
    per_pair_edge_features = TransformerListByType(
        [make_edge_features() for _ in range(type_count * type_count)]
    )

    # Assign only once both containers have been built.
    self._node_transformer = per_type_node_features
    self._edge_transformer = per_pair_edge_features