def __init__(self, **kwargs):
    """
    Set self._node_transformer, self._edge_transformer, self.tfidfNodeTextVectorizer.

    Builds one node transformer per node type (text block, grid line) and one
    edge transformer per edge type (block-block, block-line, line-line), each
    wrapped in a TransformerListByType.

    Fix: removed a stray debug print ("BETTER FEATURES") left in the constructor.
    """
    FeatureDefinition.__init__(self)
    # NOTE(review): computed but not used below — presumably kept for kwargs
    # validation / parity with sibling classes; confirm before removing.
    nbTypes = self._getTypeNumber(kwargs)

    block_transformer = FeatureUnion([
        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        ("xywh", Pipeline([
            ('selector', NodeTransformerXYWH_v2()),
            # v1: ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True))  # use in-place scaling
            ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
        ])),
        ("neighbors", Pipeline([
            ('selector', NodeTransformerNeighbors()),
            # v1: ('neighbors', StandardScaler(copy=False, with_mean=True, with_std=True))  # use in-place scaling
            ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
        ])),
        ("1hot", Pipeline([
            ('1hot', Node1HotFeatures())  # does the 1-hot encoding directly
        ]))
    ])
    grid_line_transformer = GridLine_NodeTransformer_v2()

    # one transformer per node type
    self._node_transformer = TransformerListByType([block_transformer, grid_line_transformer])

    edge_BB_transformer = FeatureUnion([
        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        ("1hot", Pipeline([
            ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality()))
        ])),
        ("boolean", Pipeline([
            ('boolean', EdgeBooleanFeatures_v2())
        ])),
        ("numerical", Pipeline([
            ('selector', EdgeNumericalSelector()),
            # v1: ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True))  # use in-place scaling
            ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
        ]))
    ])
    edge_BL_transformer = Block2GridLine_EdgeTransformer()
    edge_LL_transformer = GridLine2GridLine_EdgeTransformer()

    # one transformer per edge type; the BL transformer appears twice because
    # both edge directions must be covered
    self._edge_transformer = TransformerListByType([
        edge_BB_transformer,
        edge_BL_transformer,
        edge_BL_transformer,  # useless but required
        edge_LL_transformer
    ])

    self.tfidfNodeTextVectorizer = None  # tdifNodeTextVectorizer
def __init__(self):
    """
    Build the node and edge feature transformers and store them on self.

    Sets self._node_transformer, self._edge_transformer and
    self.tfidfNodeTextVectorizer (text features are disabled here, so the
    vectorizer is None).
    """
    FeatureDefinition.__init__(self)

    # Text features are disabled in this definition:
    # self.n_tfidf_node, self.t_ngrams_node, self.b_tfidf_node_lc = n_tfidf_node, t_ngrams_node, b_tfidf_node_lc
    # self.n_tfidf_edge, self.t_ngrams_edge, self.b_tfidf_edge_lc = n_tfidf_edge, t_ngrams_edge, b_tfidf_edge_lc
    # tdifNodeTextVectorizer = TfidfVectorizer(lowercase=self.b_tfidf_node_lc, max_features=self.n_tfidf_node
    #                                          , analyzer='char', ngram_range=self.t_ngrams_node  #(2,6)
    #                                          , dtype=np.float64)

    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    node_transformer = FeatureUnion([
        ("xywh", Pipeline([
            ('selector', NodeTransformerXYWH_v2()),
            # v1: ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True))  # use in-place scaling
            ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
        ])),
        ("neighbors", Pipeline([
            ('selector', NodeTransformerNeighbors()),
            # v1: ('neighbors', StandardScaler(copy=False, with_mean=True, with_std=True))  # use in-place scaling
            ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
        ])),
        ("1hot", Pipeline([
            ('1hot', Node1HotFeatures())  # does the 1-hot encoding directly
        ]))
    ])

    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    lEdgeFeature = [
        ("1hot", Pipeline([
            ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality()))
        ])),
        ("boolean", Pipeline([
            ('boolean', EdgeBooleanFeatures_v2())
        ])),
        ("numerical", Pipeline([
            ('selector', EdgeNumericalSelector()),
            # v1: ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True))  # use in-place scaling
            ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
        ]))
    ]
    edge_transformer = FeatureUnion(lEdgeFeature)

    # return _node_transformer, _edge_transformer, tdifNodeTextVectorizer
    self._node_transformer = node_transformer
    self._edge_transformer = edge_transformer
    self.tfidfNodeTextVectorizer = None  # tdifNodeTextVectorizer
def __init__(self, **kwargs):
    """
    Build per-type node and edge transformers and store them on self.

    With nbTypes node types, creates one node FeatureUnion per type and one
    edge FeatureUnion per (source type, target type) pair, i.e. nbTypes**2
    edge transformers, all wrapped in TransformerListByType.
    """
    FeatureDefinition.__init__(self, **kwargs)
    nbTypes = self._getTypeNumber(kwargs)

    def _make_node_union():
        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        return FeatureUnion([
            ("xywh", Pipeline([
                ('selector', NodeTransformerXYWH_v2()),
                # v1: ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True))  # use in-place scaling
                ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
            ])),
            ("neighbors", Pipeline([
                ('selector', NodeTransformerNeighbors()),
                # v1: ('neighbors', StandardScaler(copy=False, with_mean=True, with_std=True))  # use in-place scaling
                ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
            ])),
            ("1hot", Pipeline([
                ('1hot', Node1HotFeatures())  # does the 1-hot encoding directly
            ]))
        ])

    def _make_edge_union():
        # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        return FeatureUnion([
            ("1hot", Pipeline([
                ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality()))
            ])),
            ("boolean", Pipeline([
                ('boolean', EdgeBooleanFeatures_v2())
            ])),
            ("numerical", Pipeline([
                ('selector', EdgeNumericalSelector()),
                # v1: ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True))  # use in-place scaling
                ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
            ]))
        ])

    node_transformer = TransformerListByType(
        [_make_node_union() for _ in range(nbTypes)])
    edge_transformer = TransformerListByType(
        [_make_edge_union() for _ in range(nbTypes * nbTypes)])

    # return _node_transformer, _edge_transformer, tdifNodeTextVectorizer
    self._node_transformer = node_transformer
    self._edge_transformer = edge_transformer
def __init__(self, n_tfidf_node=None, t_ngrams_node=None, b_tfidf_node_lc=None,
             n_tfidf_edge=None, t_ngrams_edge=None, b_tfidf_edge_lc=None,
             bMirrorPage=True, bMultiPage=True):
    """
    Build node and edge transformers combining geometric, 1-hot and
    character-ngram tf-idf text features.

    Parameters
    ----------
    n_tfidf_node, t_ngrams_node, b_tfidf_node_lc
        max_features, ngram_range and lowercase flag of the node text tf-idf.
    n_tfidf_edge, t_ngrams_edge, b_tfidf_edge_lc
        same, for the edge source/target text tf-idf vectorizers.
    bMirrorPage, bMultiPage
        passed through to the edge text selectors; bMultiPage additionally
        enables the crossing-page (index 2) source/target text features.

    Sets self._node_transformer, self._edge_transformer and keeps the fitted
    node text vectorizer in self.tfidfNodeTextVectorizer.
    """
    FeatureDefinition.__init__(self)
    self.n_tfidf_node, self.t_ngrams_node, self.b_tfidf_node_lc = n_tfidf_node, t_ngrams_node, b_tfidf_node_lc
    self.n_tfidf_edge, self.t_ngrams_edge, self.b_tfidf_edge_lc = n_tfidf_edge, t_ngrams_edge, b_tfidf_edge_lc
    self.bMirrorPage = bMirrorPage
    self.bMultiPage = bMultiPage

    node_text_vectorizer = TfidfVectorizer(lowercase=self.b_tfidf_node_lc,
                                           max_features=self.n_tfidf_node,
                                           analyzer='char',
                                           ngram_range=self.t_ngrams_node,  # (2,6)
                                           dtype=np.float64)

    def _edge_text_vectorizer():
        # one fresh vectorizer per edge text feature
        return TfidfVectorizer(lowercase=self.b_tfidf_edge_lc,
                               max_features=self.n_tfidf_edge,
                               analyzer='char',
                               ngram_range=self.t_ngrams_edge,  # (2,6)
                               dtype=np.float64)

    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    node_transformer = FeatureUnion([
        ("text", Pipeline([
            ('selector', NodeTransformerTextEnclosed()),
            # we can use it separately from the pipeline once fitted
            ('tfidf', node_text_vectorizer),
            ('todense', SparseToDense())  # pystruct needs an array, not a sparse matrix
        ])),
        ("textlen", Pipeline([
            ('selector', NodeTransformerTextLen()),
            ('textlen', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
        ])),
        ("xywh", Pipeline([
            ('selector', NodeTransformerXYWH_v2()),
            # v1: ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True))  # use in-place scaling
            ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
        ])),
        ("neighbors", Pipeline([
            ('selector', NodeTransformerNeighbors()),
            # v1: ('neighbors', StandardScaler(copy=False, with_mean=True, with_std=True))  # use in-place scaling
            ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
        ])),
        ("1hot", Pipeline([
            ('1hot', Node1HotFeatures())  # does the 1-hot encoding directly
        ]))
        # , ('ocr', Pipeline([('ocr', NodeOCRFeatures())]))
        # , ('pnumre', Pipeline([('pnumre', NodePNumFeatures())]))
        # , ("doc_tfidf", Pipeline([
        #       ('zero', Zero2Features())
        #       # THIS ONE MUST BE LAST, because it includes a placeholder column
        #       # for the document-level tfidf
        #   ]))
    ])

    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    lEdgeFeature = [
        ("1hot", Pipeline([('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality()))])),
        ("boolean", Pipeline([('boolean', EdgeBooleanFeatures_v2())])),
        ("numerical", Pipeline([
            ('selector', EdgeNumericalSelector()),
            # v1: ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True))  # use in-place scaling
            ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
        ])),
        ("sourcetext0", Pipeline([
            ('selector', EdgeTransformerSourceText(0, bMirrorPage=bMirrorPage, bMultiPage=bMultiPage)),
            ('tfidf', _edge_text_vectorizer()),
            ('todense', SparseToDense())  # pystruct needs an array, not a sparse matrix
        ])),
        ("targettext0", Pipeline([
            ('selector', EdgeTransformerTargetText(0, bMirrorPage=bMirrorPage, bMultiPage=bMultiPage)),
            ('tfidf', _edge_text_vectorizer()),
            ('todense', SparseToDense())  # pystruct needs an array, not a sparse matrix
        ])),
        ("sourcetext1", Pipeline([
            ('selector', EdgeTransformerSourceText(1, bMirrorPage=bMirrorPage, bMultiPage=bMultiPage)),
            ('tfidf', _edge_text_vectorizer()),
            ('todense', SparseToDense())  # pystruct needs an array, not a sparse matrix
        ])),
        ("targettext1", Pipeline([
            ('selector', EdgeTransformerTargetText(1, bMirrorPage=bMirrorPage, bMultiPage=bMultiPage)),
            ('tfidf', _edge_text_vectorizer()),
            ('todense', SparseToDense())  # pystruct needs an array, not a sparse matrix
        ]))
    ]
    if bMultiPage:
        # crossing-page edges get their own source/target text features
        lEdgeFeature.extend([
            ("sourcetext2", Pipeline([
                ('selector', EdgeTransformerSourceText(2, bMirrorPage=bMirrorPage, bMultiPage=bMultiPage)),
                ('tfidf', _edge_text_vectorizer()),
                ('todense', SparseToDense())  # pystruct needs an array, not a sparse matrix
            ])),
            ("targettext2", Pipeline([
                ('selector', EdgeTransformerTargetText(2, bMirrorPage=bMirrorPage, bMultiPage=bMultiPage)),
                ('tfidf', _edge_text_vectorizer()),
                ('todense', SparseToDense())  # pystruct needs an array, not a sparse matrix
            ]))
        ])
    edge_transformer = FeatureUnion(lEdgeFeature)

    # return _node_transformer, _edge_transformer, tdifNodeTextVectorizer
    self._node_transformer = node_transformer
    self._edge_transformer = edge_transformer
    self.tfidfNodeTextVectorizer = node_text_vectorizer
def __init__(self, nbClass=None,
             n_feat_node=None, t_ngrams_node=None, b_node_lc=None,
             n_feat_edge=None, t_ngrams_edge=None, b_edge_lc=None,
             n_jobs=1):
    """
    Build node and edge transformers where text is featurized through a
    per-class logit (NodeTransformerLogit / EdgeTransformerLogit).

    Parameters
    ----------
    nbClass : int
        number of classes; mandatory.
    n_feat_node, t_ngrams_node, b_node_lc
        max_features, ngram_range and lowercase flag for the node text model.
    n_feat_edge, t_ngrams_edge, b_edge_lc
        same, for the edge side (stored on self; the edge logit reuses the
        node transformer).
    n_jobs : int
        parallelism budget; all but one job go to the NodeTransformerLogit.

    Sets self._node_transf_logit, self._node_transformer, self._edge_transformer.
    """
    FeatureDefinition.__init__(self)

    assert nbClass, "Error: indicate the number of classes"
    self.nbClass = nbClass
    self.n_feat_node, self.t_ngrams_node, self.b_node_lc = n_feat_node, t_ngrams_node, b_node_lc
    self.n_feat_edge, self.t_ngrams_edge, self.b_edge_lc = n_feat_edge, t_ngrams_edge, b_edge_lc

    # Why the FeatureUnions below use n_jobs=1:
    # with n_jobs > 1, sklearn's FeatureUnion pickles its inputs via
    # joblib.Parallel/cPickle, and the cyclic links between a node and its
    # neighbours blow the recursion limit ("RuntimeError: maximum recursion
    # depth exceeded" in fitTranformers). We cannot specialize the
    # serialization of the Block objects, so the graph-consuming part stays
    # single-job.  JLM April 2017
    n_jobs_from_graph = 1  # we cannot pickle the list of graphs, so n_jobs = 1 for this part!
    # n_jobs_NodeTransformerLogit = max(1, n_jobs/2)  # half of the jobs for the NodeTransformerLogit, the rest for the others
    n_jobs_NodeTransformerLogit = max(1, n_jobs - 1)

    # we keep a ref onto it because its fitting needs not only all the nodes,
    # but also additional info, available on the graph objects
    self._node_transf_logit = NodeTransformerLogit(nbClass,
                                                   self.n_feat_node,
                                                   self.t_ngrams_node,
                                                   self.b_node_lc,
                                                   n_jobs=n_jobs_NodeTransformerLogit)

    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    node_transformer = FeatureUnion([
        ("text", self._node_transf_logit),
        ("textlen", Pipeline([
            ('selector', NodeTransformerTextLen()),
            # v2: ('textlen', StandardScaler(copy=False, with_mean=True, with_std=True))  # use in-place scaling
            ('textlen', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
        ])),
        ("xywh", Pipeline([
            ('selector', NodeTransformerXYWH_v2()),
            # v2: ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True))  # use in-place scaling
            ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
        ])),
        ("neighbors", Pipeline([
            ('selector', NodeTransformerNeighbors()),
            # v2: ('neighbors', StandardScaler(copy=False, with_mean=True, with_std=True))  # use in-place scaling
            ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
        ])),
        ("1hot", Pipeline([
            ('1hot', Node1HotFeatures())  # does the 1-hot encoding directly
        ]))
        # , ('ocr', Pipeline([('ocr', NodeOCRFeatures())]))
        # , ('pnumre', Pipeline([('pnumre', NodePNumFeatures())]))
        # , ("doc", Pipeline([
        #       ('zero', Zero2Features())
        #       # THIS ONE MUST BE LAST, because it includes a placeholder column
        #       # for the document-level tfidf
        #   ]))
    ], n_jobs=n_jobs_from_graph)

    # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
    lEdgeFeature = [
        ("1hot", Pipeline([
            ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality()))
        ])),
        ("boolean", Pipeline([
            ('boolean', EdgeBooleanFeatures_v2())
        ])),
        ("numerical", Pipeline([
            ('selector', EdgeNumericalSelector()),
            # v2: ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True))  # use in-place scaling
            ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
        ])),
        ("nodetext", EdgeTransformerLogit(nbClass, self._node_transf_logit))
    ]
    edge_transformer = FeatureUnion(lEdgeFeature, n_jobs=n_jobs_from_graph)

    # return _node_transformer, _edge_transformer, tdifNodeTextVectorizer
    self._node_transformer = node_transformer
    self._edge_transformer = edge_transformer