Ejemplo n.º 1
0
    def __init__(self, **kwargs):
        """
        Build the node and edge feature transformers for this definition.

        Attributes set:
          - _node_transformer: TransformerListByType wrapping the block
            transformer and the grid-line transformer, in that order
          - _edge_transformer: TransformerListByType wrapping four edge
            transformers (see the list at the end of this method)
          - tfidfNodeTextVectorizer: initialised to None

        kwargs is only forwarded to self._getTypeNumber().
        """
        FeatureDefinition.__init__(self)

        # NOTE(review): the returned value is not used anywhere below; the
        # call is kept as-is in case it has side effects - confirm.
        n_types = self._getTypeNumber(kwargs)

        def make_quantile_scaler():
            # copy=False => in-place scaling.
            # (v1 used StandardScaler(copy=False, with_mean=True, with_std=True))
            return QuantileTransformer(n_quantiles=self.n_QUANTILES,
                                       copy=False)

        # ---- node features ------------------------------------------------
        xywh_pipeline = Pipeline([
            ('selector', NodeTransformerXYWH_v2()),
            ('xywh', make_quantile_scaler()),
        ])
        neighbors_pipeline = Pipeline([
            ('selector', NodeTransformerNeighbors()),
            ('neighbors', make_quantile_scaler()),
        ])
        # Node1HotFeatures does the 1-hot encoding directly, no scaling step
        node_1hot_pipeline = Pipeline([
            ('1hot', Node1HotFeatures()),
        ])

        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        block_transformer = FeatureUnion([
            ("xywh", xywh_pipeline),
            ("neighbors", neighbors_pipeline),
            ("1hot", node_1hot_pipeline),
        ])
        grid_line_transformer = GridLine_NodeTransformer()

        node_transformers = [block_transformer, grid_line_transformer]
        self._node_transformer = TransformerListByType(node_transformers)

        # ---- edge features ------------------------------------------------
        edge_1hot_pipeline = Pipeline([
            ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality())),
        ])
        edge_boolean_pipeline = Pipeline([
            ('boolean', EdgeBooleanFeatures_v2()),
        ])
        edge_numerical_pipeline = Pipeline([
            ('selector', EdgeNumericalSelector()),
            ('numerical', make_quantile_scaler()),
        ])

        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        edge_BB_transformer = FeatureUnion([
            ("1hot", edge_1hot_pipeline),
            ("boolean", edge_boolean_pipeline),
            ("numerical", edge_numerical_pipeline),
        ])
        edge_BL_transformer = Block2GridLine_EdgeTransformer()
        edge_LL_transformer = GridLine2GridLine_EdgeTransformer()

        # The same block-to-grid-line instance fills two slots: the second
        # occurrence is useless but required.
        # NOTE(review): presumably one slot per (source type, target type)
        # pair - confirm against TransformerListByType.
        edge_transformers = [
            edge_BB_transformer,
            edge_BL_transformer,
            edge_BL_transformer,
            edge_LL_transformer,
        ]
        self._edge_transformer = TransformerListByType(edge_transformers)

        # No node text vectorizer is created here
        self.tfidfNodeTextVectorizer = None