# word unigram and bigram "T[-2]", "T[-1]", "T[0]", "T[1]", "T[2]", "T[-2,-1]", "T[-1,0]", "T[0,1]", "T[1,2]", # pos unigram and bigram "T[-3][1]", "T[-2][1]", "T[-1][1]", "T[-3,-2][1]", "T[-2,-1][1]", ] transformer = TaggedTransformer(template) # ============================================================================ # # Model # ============================================================================ # crf_params = { 'c1': 1.0, # coefficient for L1 penalty 'c2': 1e-3, # coefficient for L2 penalty 'max_iterations': 1000, # # include transitions that are possible, but not observed 'feature.possible_transitions': True } model = Model(CRF(params=crf_params), "CRF")
"T[-1][3]", ] transformer = TaggedTransformer(template) flow.transform(transformer) # =========================================================================# # Models # =========================================================================# crf_params = { 'c1': 1.0, # coefficient for L1 penalty 'c2': 1e-3, # coefficient for L2 penalty 'max_iterations': 1000, # # include transitions that are possible, but not observed 'feature.possible_transitions': True } flow.add_model(Model(CRF(params=crf_params), "CRF")) # =========================================================================# # Evaluation # =========================================================================# flow.add_score('f1_chunk') flow.add_score('accuracy_chunk') flow.set_validation(TrainTestSplitValidation(test_size=0.1)) # flow.set_validation(TrainTestSplitValidation(test_size=0.3)) flow.train() # flow.save_model("CRF", filename="ner_crf_20171006_template_2.model")
from os.path import dirname, join

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from underthesea_flow.flow import Flow
from underthesea_flow.model import Model
from underthesea_flow.validation.validation import TrainTestSplitValidation

from load_data import load_dataset
from transformer import TfidfVectorizer

# Script: train a one-vs-rest logistic regression on the "fb_bank_act_3"
# corpus with TF-IDF features (ngram_range=(1, 2)) and export it as
# "LogisticRegression".
if __name__ == '__main__':
    # The corpus is resolved relative to this script: the triple dirname()
    # climbs to the grandparent of the script's directory, which holds "data".
    data_file = join(dirname(dirname(dirname(__file__))),
                     "data", "fb_bank_act_3", "corpus", "train.xlsx")
    X, y = load_dataset(data_file)

    flow = Flow()
    flow.log_folder = "log"
    flow.data(X, y)

    # NOTE(review): the binarizer is presumably applied to the labels and the
    # TF-IDF vectorizer to the texts -- confirm against Flow.transform.
    transformer = TfidfVectorizer(ngram_range=(1, 2))
    flow.transform(MultiLabelBinarizer())
    flow.transform(transformer)

    # A single label is used both to add the model and to export it, so the
    # two call sites cannot drift apart.
    model_name = "LogisticRegression"
    flow.add_model(
        Model(OneVsRestClassifier(LogisticRegression()), model_name))

    # Optional learning-curve run, currently disabled:
    # flow.set_learning_curve(0.7, 1, 0.3)
    flow.set_validation(TrainTestSplitValidation(test_size=0.1))
    flow.train()

    flow.export_folder = "model"
    flow.export(model_name=model_name)
from os.path import dirname, join

from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from underthesea_flow.flow import Flow
from underthesea_flow.model import Model
from underthesea_flow.validation.validation import TrainTestSplitValidation

from load_data import load_dataset
from transformer import TfidfVectorizer

# Script: train a one-vs-rest SGD classifier on the "fb_bank_category" corpus
# with TF-IDF features (ngram_range=(1, 3)) and export it as "SGD".
if __name__ == '__main__':
    # The corpus is resolved relative to this script: the triple dirname()
    # climbs to the grandparent of the script's directory, which holds "data".
    data_file = join(dirname(dirname(dirname(__file__))),
                     "data", "fb_bank_category", "corpus", "train.xlsx")
    X, y = load_dataset(data_file)

    flow = Flow()
    flow.log_folder = "log"
    flow.data(X, y)

    # NOTE(review): the binarizer is presumably applied to the labels and the
    # TF-IDF vectorizer to the texts -- confirm against Flow.transform.
    transformer = TfidfVectorizer(ngram_range=(1, 3))
    flow.transform(MultiLabelBinarizer())
    flow.transform(transformer)

    # A single label is used both to add the model and to export it, so the
    # two call sites cannot drift apart.
    model_name = "SGD"
    flow.add_model(Model(OneVsRestClassifier(SGDClassifier()), model_name))

    # Optional learning-curve run, currently disabled:
    # flow.set_learning_curve(0.7, 1, 0.3)
    flow.set_validation(TrainTestSplitValidation(test_size=0.1))
    flow.train()

    flow.export_folder = "model"
    flow.export(model_name=model_name)
from os.path import dirname, join

from underthesea_flow.flow import Flow
from underthesea_flow.model import Model
from underthesea_flow.validation.validation import TrainTestSplitValidation

from load_data import load_dataset
from model.model_fasttext import FastTextClassifier

# Script: train a FastTextClassifier on the "fb_bank_act_2" corpus and save
# it to model/fasttext.model.
if __name__ == '__main__':
    # The corpus is resolved relative to this script: four dirname() calls
    # climb three levels above the script's directory, which holds "data".
    data_file = join(dirname(dirname(dirname(dirname(__file__)))),
                     "data", "fb_bank_act_2", "corpus", "data.xlsx")
    X, y = load_dataset(data_file)

    flow = Flow()
    flow.log_folder = "log"
    flow.data(X, y)

    # A single label is used both to add the model and to save it, so the two
    # call sites cannot drift apart.
    model_name = "FastText"
    flow.add_model(Model(FastTextClassifier(), model_name))
    flow.set_validation(TrainTestSplitValidation(test_size=0.1))

    # Stand-alone validation run, currently disabled:
    # flow.validation()
    flow.train()

    model_filename = join("model", "fasttext.model")
    flow.save_model(model_name=model_name, model_filename=model_filename)