Ejemplo n.º 1
0
    def __init__(self, **kwargs):
        """
        Build the per-type node and edge transformers of this feature definition.

        Sets self._node_transformer, self._edge_transformer and
        self.tfidfNodeTextVectorizer (the latter is left to None).

        kwargs: passed, as a dict, to self._getTypeNumber.
        """
        FeatureDefinition.__init__(self)

        # The value returned here is not used in this method; the call is kept
        # as-is (presumably it checks the type-related kwargs - TODO confirm).
        self._getTypeNumber(kwargs)

        def quantile_scaler():
            # One fresh scaler per pipeline, with in-place scaling (copy=False).
            # (v1 used StandardScaler(copy=False, with_mean=True, with_std=True))
            return QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)

        # ---- NODES ---------------------------------------------------------
        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        # Names and order of the union entries and pipeline steps must be kept.
        node_xywh = Pipeline([
            ('selector', NodeTransformerXYWH_v2()),
            ('xywh', quantile_scaler()),
        ])
        node_neighbors = Pipeline([
            ('selector', NodeTransformerNeighbors()),
            ('neighbors', quantile_scaler()),
        ])
        node_1hot = Pipeline([
            ('1hot', Node1HotFeatures()),  # does the 1-hot encoding directly
        ])
        block_features = FeatureUnion([
            ("xywh", node_xywh),
            ("neighbors", node_neighbors),
            ("1hot", node_1hot),
        ])
        cut_line_features = CutLine_NodeTransformer_v2()

        # one node transformer per node type: blocks first, then cut lines
        self._node_transformer = TransformerListByType(
            [block_features, cut_line_features])

        # ---- EDGES ---------------------------------------------------------
        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        edge_1hot = Pipeline([
            ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality())),
        ])
        edge_boolean = Pipeline([
            ('boolean', EdgeBooleanFeatures_v2()),
        ])
        edge_numerical = Pipeline([
            ('selector', EdgeNumericalSelector()),
            ('numerical', quantile_scaler()),
        ])
        block2block = FeatureUnion([
            ("1hot", edge_1hot),
            ("boolean", edge_boolean),
            ("numerical", edge_numerical),
        ])
        block2line = Block2CutLine_EdgeTransformer()
        line2line = CutLine2CutLine_EdgeTransformer()

        # Four slots are expected; the very same block-to-cut-line transformer
        # object fills the 2nd and 3rd ones (the 3rd is useless but required).
        per_type_edge_transformers = [
            block2block,
            block2line,
            block2line,
            line2line,
        ]
        self._edge_transformer = TransformerListByType(per_type_edge_transformers)

        # no TF-IDF vectorizer of the node text in this feature definition
        self.tfidfNodeTextVectorizer = None
Ejemplo n.º 2
0
    def __init__(self):
        """
        Set up the node and edge feature extractors.

        Assigns self._node_transformer and self._edge_transformer (one
        FeatureUnion each) and sets self.tfidfNodeTextVectorizer to None:
        no TF-IDF vectorizer of the node text is built here.
        """
        FeatureDefinition.__init__(self)

        def in_place_scaler():
            # copy=False: use in-place scaling
            return StandardScaler(copy=False, with_mean=True, with_std=True)

        # ---- NODES ---------------------------------------------------------
        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        # Names and order of the union entries and pipeline steps must be kept.
        xywh_pipeline = Pipeline([
            ('selector', NodeTransformerXYWH_v2()),
            ('xywh', in_place_scaler()),
        ])
        node_features = FeatureUnion([
            ("xywh", xywh_pipeline),
            ("neighbors", NodeTransformerNeighbors_v2()),
            ("1hot", Node1HotFeatures()),  # does the 1-hot encoding directly
        ])

        # ---- EDGES ---------------------------------------------------------
        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        edge_1hot = Edge1HotFeatures(PageNumberSimpleSequenciality())
        edge_boolean = EdgeBooleanFeatures_v2()
        edge_numerical = Pipeline([
            ('selector', EdgeNumericalSelector()),
            ('numerical', in_place_scaler()),
        ])
        edge_features = FeatureUnion([
            ("1hot", edge_1hot),
            ('boolean', edge_boolean),
            ("numerical", edge_numerical),
        ])

        # Publish everything only once both unions have been built.
        self._node_transformer = node_features
        self._edge_transformer = edge_features
        self.tfidfNodeTextVectorizer = None
    def __init__(self):
        """
        Create the feature transformers of this feature definition.

        On return, self._node_transformer and self._edge_transformer each hold
        a FeatureUnion, and self.tfidfNodeTextVectorizer is None.
        """
        FeatureDefinition.__init__(self)

        # Keyword arguments shared by both scalers; copy=False means that the
        # scaling is done in place.
        scaler_options = dict(copy=False, with_mean=True, with_std=True)

        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        # (entry names, step names and their order are all significant)
        node_parts = []
        node_parts.append(
            ("xywh", Pipeline([
                ('selector', NodeTransformerXYWH_v2()),
                ('xywh', StandardScaler(**scaler_options)),
            ])))
        node_parts.append(("neighbors", NodeTransformerNeighbors_v2()))
        # this one does the 1-hot encoding directly
        node_parts.append(("1hot", Node1HotFeatures()))
        node_union = FeatureUnion(node_parts)

        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        edge_parts = []
        edge_parts.append(
            ("1hot", Edge1HotFeatures(PageNumberSimpleSequenciality())))
        edge_parts.append(('boolean', EdgeBooleanFeatures_v2()))
        edge_parts.append(
            ("numerical", Pipeline([
                ('selector', EdgeNumericalSelector()),
                ('numerical', StandardScaler(**scaler_options)),
            ])))
        edge_union = FeatureUnion(edge_parts)

        # The attributes are assigned last, after both unions exist.
        self._node_transformer = node_union
        self._edge_transformer = edge_union
        self.tfidfNodeTextVectorizer = None  # no TF-IDF vectorizer is used