"""Build a multi-label text-classification flow and export its "SVC" model.

Run as a script. The training corpus is read from
``data/fb_bank/corpus/train.xlsx``, located three directory levels above
the directory that contains this file.
"""
from os.path import abspath, dirname, join
import sys

from languageflow.flow import Flow
from languageflow.model import Model
from languageflow.transformer.count import CountVectorizer
from languageflow.validation.validation import TrainTestSplitValidation
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC

from load_data import load_dataset

if __name__ == '__main__':
    # Anchor the path on the absolute location of this script.  A bare
    # ``__file__`` can be relative (e.g. ``python train.py`` on older
    # interpreters), in which case the repeated ``dirname`` calls collapse
    # to '' and the corpus would be looked up relative to the current
    # working directory instead of the repository layout.
    script_dir = dirname(abspath(__file__))
    project_root = dirname(dirname(dirname(script_dir)))
    data_file = join(project_root, "data", "fb_bank", "corpus", "train.xlsx")

    X, y = load_dataset(data_file)

    flow = Flow()
    flow.data(X, y)

    # Transformers are registered in the original order: the label
    # binarizer first, then the count vectorizer.
    transformer = CountVectorizer(ngram_range=(1, 2), max_df=0.5, min_df=8)
    flow.transform(MultiLabelBinarizer())
    flow.transform(transformer)

    # One-vs-rest wrapper around a linear-kernel SVC, registered as "SVC".
    flow.add_model(Model(OneVsRestClassifier(SVC(kernel='linear')), "SVC"))
    flow.set_validation(TrainTestSplitValidation(test_size=0.1))

    # The validation/training run is intentionally left disabled; only the
    # export step below is executed.
    # flow.train()
    flow.export(model_name="SVC", export_folder="model")
# =========================================================================#
# Feature template handed to TaggedTransformer.  The groups below are
# concatenated in the same order as the original flat list.
template = (
    # ".lower" variants for offsets -2..2
    ["T[-2].lower", "T[-1].lower", "T[0].lower", "T[1].lower", "T[2].lower"]
    # ".istitle" variants (current token first, then neighbours)
    + ["T[0].istitle", "T[-1].istitle", "T[1].istitle",
       "T[-2].istitle", "T[2].istitle"]
    # word unigram and bigram
    + ["T[-2]", "T[-1]", "T[0]", "T[1]", "T[2]"]
    + ["T[-2,-1]", "T[-1,0]", "T[0,1]", "T[1,2]"]
    # pos unigram and bigram
    + ["T[-2][1]", "T[-1][1]", "T[0][1]", "T[1][1]", "T[2][1]"]
    + ["T[-2,-1][1]", "T[-1,0][1]", "T[0,1][1]", "T[1,2][1]"]
    # ner
    + ["T[-3][3]", "T[-2][3]", "T[-1][3]"]
)
transformer = TaggedTransformer(template)
flow.transform(transformer)

# =========================================================================#
# Models
# =========================================================================#
# Parameters passed to the CRF estimator via ``params``.
crf_params = {
    'c1': 1.0,  # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 1000,
}
# include transitions that are possible, but not observed
crf_params['feature.possible_transitions'] = True

flow.add_model(Model(CRF(params=crf_params), "CRF"))

# =========================================================================#
# Evaluation