def __init__(self, **kwargs):
    """
    Build the node and edge feature transformers.

    Sets self._node_transformer and self._edge_transformer (both are
    TransformerListByType instances) and sets self.tfidfNodeTextVectorizer
    to None.
    """
    FeatureDefinition.__init__(self)

    # NOTE(review): the value returned by _getTypeNumber is not used in this
    # constructor; the call itself is kept exactly as in the original.
    self._getTypeNumber(kwargs)

    # ---- node features --------------------------------------------------
    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    # (v1 used StandardScaler(copy=False, with_mean=True, with_std=True)
    #  where QuantileTransformer is used now)
    xywh_pipeline = Pipeline([
        ('selector', NodeTransformerXYWH_v2()),
        # use in-place scaling
        ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
    ])
    neighbors_pipeline = Pipeline([
        ('selector', NodeTransformerNeighbors()),
        # use in-place scaling
        ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
    ])
    node_1hot_pipeline = Pipeline([
        ('1hot', Node1HotFeatures()),  # does the 1-hot encoding directly
    ])
    block_features = FeatureUnion([
        ("xywh", xywh_pipeline),
        ("neighbors", neighbors_pipeline),
        ("1hot", node_1hot_pipeline),
    ])

    cut_line_features = CutLine_NodeTransformer_v2()

    self._node_transformer = TransformerListByType(
        [block_features, cut_line_features]
    )

    # ---- edge features --------------------------------------------------
    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    edge_1hot_pipeline = Pipeline([
        ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality())),
    ])
    edge_boolean_pipeline = Pipeline([
        ('boolean', EdgeBooleanFeatures_v2()),
    ])
    # (v1 used StandardScaler(copy=False, with_mean=True, with_std=True)
    #  where QuantileTransformer is used now)
    edge_numerical_pipeline = Pipeline([
        ('selector', EdgeNumericalSelector()),
        # use in-place scaling
        ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False)),
    ])
    block_to_block = FeatureUnion([
        ("1hot", edge_1hot_pipeline),
        ("boolean", edge_boolean_pipeline),
        ("numerical", edge_numerical_pipeline),
    ])

    block_to_line = Block2CutLine_EdgeTransformer()
    line_to_line = CutLine2CutLine_EdgeTransformer()

    per_type_edge_transformers = [
        block_to_block,
        block_to_line,
        block_to_line,  # useless but required
        line_to_line,
    ]
    self._edge_transformer = TransformerListByType(per_type_edge_transformers)

    # no text vectorizer in this feature definition
    self.tfidfNodeTextVectorizer = None
def __init__(self):
    """
    Build the node and edge feature transformers.

    Sets self._node_transformer and self._edge_transformer (both are
    FeatureUnion instances) and sets self.tfidfNodeTextVectorizer to None:
    no tf-idf text vectorizer is created here.
    """
    FeatureDefinition.__init__(self)

    # ---- node features --------------------------------------------------
    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    xywh_pipeline = Pipeline([
        ('selector', NodeTransformerXYWH_v2()),
        # use in-place scaling
        ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True)),
    ])
    node_features = [
        ("xywh", xywh_pipeline),
        ("neighbors", NodeTransformerNeighbors_v2()),
        ("1hot", Node1HotFeatures()),  # does the 1-hot encoding directly
    ]
    node_union = FeatureUnion(node_features)

    # ---- edge features --------------------------------------------------
    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    edge_1hot = Edge1HotFeatures(PageNumberSimpleSequenciality())
    edge_boolean = EdgeBooleanFeatures_v2()
    edge_numerical = Pipeline([
        ('selector', EdgeNumericalSelector()),
        # use in-place scaling
        ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True)),
    ])
    edge_union = FeatureUnion([
        ("1hot", edge_1hot),
        ('boolean', edge_boolean),
        ("numerical", edge_numerical),
    ])

    # attributes are published only once both unions have been built
    self._node_transformer = node_union
    self._edge_transformer = edge_union
    self.tfidfNodeTextVectorizer = None
def __init__(self):
    """
    Create the feature transformers used for nodes and for edges.

    On return, self._node_transformer and self._edge_transformer each hold
    a FeatureUnion, and self.tfidfNodeTextVectorizer is None.
    """
    FeatureDefinition.__init__(self)

    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    # Node side: scaled geometry, neighbor features, 1-hot features.
    geometry_steps = [
        ('selector', NodeTransformerXYWH_v2()),
        ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True)),  # in-place scaling
    ]
    nodes = FeatureUnion([
        ("xywh", Pipeline(geometry_steps)),
        ("neighbors", NodeTransformerNeighbors_v2()),
        ("1hot", Node1HotFeatures()),  # does the 1-hot encoding directly
    ])

    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    # Edge side: 1-hot, boolean and scaled numerical features.
    one_hot_part = ("1hot", Edge1HotFeatures(PageNumberSimpleSequenciality()))
    boolean_part = ('boolean', EdgeBooleanFeatures_v2())
    numerical_steps = [
        ('selector', EdgeNumericalSelector()),
        ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True)),  # in-place scaling
    ]
    numerical_part = ("numerical", Pipeline(numerical_steps))
    edges = FeatureUnion([one_hot_part, boolean_part, numerical_part])

    # Store the results; no text vectorizer is defined for this feature set.
    self._node_transformer = nodes
    self._edge_transformer = edges
    self.tfidfNodeTextVectorizer = None