def train(X, Y, valX=None, valY=None, testX=None, testY=None):
    '''
    train()

    Train a Conditional Random Field with pycrfsuite and score the fitted
    model on every data split that was supplied.

    @param X.     Training feature sequences (handed to formatFeatures()).
    @param Y.     Training tag sequences, parallel to X.
    @param valX.  Optional held-out dev features; scored only when truthy.
    @param valY.  Tags for valX.
    @param testX. Optional test features; scored only when truthy.
    @param testY. Tags for testX.

    @return A tuple (model, scores). model is the serialized CRF read back
            from disk as bytes. scores maps 'train' (plus 'dev' / 'test'
            when those splits are given) to the result of
            compute_performance_stats(), and 'hyperparams' to a dict of
            module name -> bool taken from enabledModules().
    '''
    # Format features for crfsuite
    feats = formatFeatures(X, Y)

    # Load every labeled sequence into a trainer
    trainer = pycrfsuite.Trainer(verbose=False)
    for xseq, yseq in pycrfInstances(feats, labeled=True):
        trainer.append(xseq, yseq)

    # The trainer is given a file path to write the model to, so round-trip
    # through a temporary file. Only the path is needed: release the
    # descriptor right away instead of holding it open during training.
    os_handle, tmp_file = tempfile.mkstemp(dir=tmp_dir, suffix="crf_temp")
    os.close(os_handle)
    try:
        trainer.train(tmp_file)

        # Read the trained model into memory (so it can be pickled)
        with open(tmp_file, 'rb') as f:
            model = f.read()
    finally:
        # Remove the temporary file even when training or reading fails
        os.remove(tmp_file)

    # Information about how well the model fits each split
    scores = {}
    scores['train'] = compute_performance_stats('train', predict(model, X), Y)

    if valX:
        val_pred = predict(model, valX)
        scores['dev'] = compute_performance_stats('dev', val_pred, valY)

    if testX:
        test_pred = predict(model, testX)
        scores['test'] = compute_performance_stats('test', test_pred, testY)

    # Keep track of which external modules were used for building this model
    scores['hyperparams'] = {
        module: bool(enabled)
        for module, enabled in enabledModules().items()
    }

    return model, scores
def train(X, Y, val_X=None, val_Y=None, test_X=None, test_Y=None):
    '''
    train()

    Train a Conditional Random Field for sequence tagging.

    @param X.      List of sparse-matrix sequences. Each sequence is one sentence.
    @param Y.      List of sequence tags. Each sequence is the sentence's per-token tags.
    @param val_X.  More X data, but a heldout dev set. Scored only when truthy.
    @param val_Y.  More Y data, but a heldout dev set.
    @param test_X. More X data, but a test set. Scored only when truthy.
    @param test_Y. More Y data, but a test set.

    @return A tuple (model, scores). model is the serialized CRF as bytes.
            scores maps 'train' (plus 'dev' / 'test' when given) to the
            result of compute_performance_stats(), and 'hyperparams' to a
            dict of module name -> bool taken from enabled_modules().
    '''
    # Format features for crfsuite
    feats = format_features(X, Y)

    # Create a Trainer object and feed it every labeled sequence.
    trainer = pycrfsuite.Trainer(verbose=False)
    for xseq, yseq in pycrf_instances(feats, labeled=True):
        trainer.append(xseq, yseq)

    # Train the model. The trainer writes to a file path, so use a temporary
    # file in the platform's default temp directory (a literal '/tmp' does
    # not exist everywhere). Only the path is needed, so the descriptor is
    # released immediately.
    os_handle, tmp_file = tempfile.mkstemp(suffix="crf_temp")
    os.close(os_handle)
    try:
        trainer.train(tmp_file)

        # Read the trained model into memory (so it can be pickled)
        with open(tmp_file, 'rb') as f:
            model = f.read()
    finally:
        # Remove the temporary file even when training or reading fails
        os.remove(tmp_file)

    ######################################################################

    # information about fitting the model
    scores = {}

    # how well does the model fit the training data?
    train_pred = predict(model, X)
    scores['train'] = compute_performance_stats('train', train_pred, Y)

    if val_X:
        val_pred = predict(model, val_X)
        scores['dev'] = compute_performance_stats('dev', val_pred, val_Y)

    if test_X:
        test_pred = predict(model, test_X)
        scores['test'] = compute_performance_stats('test', test_pred, test_Y)

    # keep track of which external modules were used for building this model!
    scores['hyperparams'] = {
        module: bool(enabled)
        for module, enabled in enabled_modules().items()
    }

    return model, scores
def train(X, Y, val_X=None, val_Y=None, test_X=None, test_Y=None):
    '''
    train()

    Train a Conditional Random Field for sequence tagging, using the
    crfsuite 'pa' training algorithm.

    @param X.      List of sparse-matrix sequences. Each sequence is one sentence.
    @param Y.      List of sequence tags. Each sequence is the sentence's per-token tags.
    @param val_X.  More X data, but a heldout dev set. Scored only when truthy.
    @param val_Y.  More Y data, but a heldout dev set.
    @param test_X. More X data, but a test set. Scored only when truthy.
    @param test_Y. More Y data, but a test set.

    @return A tuple (model, scores). model is the serialized CRF as bytes.
            scores maps 'train' (plus 'dev' / 'test' when given) to the
            result of compute_performance_stats(), and 'hyperparams' to a
            dict of module name -> bool taken from enabled_modules().
    '''
    # Format features for crfsuite
    feats = format_features(X, Y)

    # Create a Trainer object.
    trainer = pycrfsuite.Trainer(verbose=False)

    # Available algorithms: 'lbfgs', 'l2sgd', 'ap', 'pa', 'arow'.
    # trainer.params() / trainer.get_params() list the tunable parameters
    # of the selected algorithm.
    trainer.select(algorithm='pa')

    for xseq, yseq in pycrf_instances(feats, labeled=True):
        trainer.append(xseq, yseq)

    # Train the model. The trainer writes to a file path, so a temporary
    # file is created under tmp_dir (per the original note:
    # cliner_dir/data/tmp). Only the path is needed, so the descriptor is
    # released immediately.
    os_handle, tmp_file = tempfile.mkstemp(dir=tmp_dir, suffix="crf_temp")
    os.close(os_handle)
    try:
        trainer.train(tmp_file)
        # trainer.logparser.last_iteration holds the stats of the final
        # training iteration if they are needed for debugging.

        # Read the trained model into memory (so it can be pickled)
        with open(tmp_file, 'rb') as f:
            model = f.read()
    finally:
        # Remove the temporary file even when training or reading fails
        os.remove(tmp_file)

    ######################################################################

    # information about fitting the model
    scores = {}

    # how well does the model fit the training data?
    train_pred = predict(model, X)
    scores['train'] = compute_performance_stats('train', train_pred, Y)

    if val_X:
        val_pred = predict(model, val_X)
        scores['dev'] = compute_performance_stats('dev', val_pred, val_Y)

    if test_X:
        test_pred = predict(model, test_X)
        scores['test'] = compute_performance_stats('test', test_pred, test_Y)

    # keep track of which external modules were used for building this model!
    scores['hyperparams'] = {
        module: bool(enabled)
        for module, enabled in enabled_modules().items()
    }

    return model, scores
def train(X, Y, val_X=None, val_Y=None):
    '''
    train()

    Train a Conditional Random Field for sequence tagging.

    @param X.     List of sparse-matrix sequences. Each sequence is one sentence.
    @param Y.     List of sequence tags. Each sequence is the sentence's per-token tags.
    @param val_X. More X data, but a heldout dev set. Scored only when truthy.
    @param val_Y. More Y data, but a heldout dev set.

    @return A tuple (model, scores). model is the serialized CRF read back
            from disk in binary mode. scores maps 'train' (plus 'dev' when
            given) to the result of compute_performance_stats(), and
            'hyperparams' to a dict of module name -> bool taken from
            enabled_modules().
    '''
    # Format features for crfsuite
    feats = format_features(X, Y)

    # Create a Trainer object and feed it every labeled sequence.
    trainer = pycrfsuite.Trainer(verbose=False)
    for xseq, yseq in pycrf_instances(feats, labeled=True):
        trainer.append(xseq, yseq)

    # Train the model. The trainer writes to a file path, so use a temporary
    # file in the platform's default temp directory (a literal '/tmp' does
    # not exist everywhere). Only the path is needed, so the descriptor is
    # released immediately.
    os_handle, tmp_file = tempfile.mkstemp(suffix="crf_temp")
    os.close(os_handle)
    try:
        trainer.train(tmp_file)

        # Read the trained model into memory (so it can be pickled).
        # The model file is binary: text mode would try to decode it and
        # translate newlines, so it must be opened with 'rb'.
        with open(tmp_file, 'rb') as f:
            model = f.read()
    finally:
        # Remove the temporary file even when training or reading fails
        os.remove(tmp_file)

    ######################################################################

    # information about fitting the model
    scores = {}

    # how well does the model fit the training data?
    train_pred = predict(model, X)
    scores['train'] = compute_performance_stats('train', train_pred, Y)

    if val_X:
        val_pred = predict(model, val_X)
        scores['dev'] = compute_performance_stats('dev', val_pred, val_Y)

    # keep track of which external modules were used for building this model!
    scores['hyperparams'] = {
        module: bool(enabled)
        for module, enabled in enabled_modules().items()
    }

    return model, scores