def test(self):
    """Train an SVC text classifier on the AIVIVN2019 sample corpus, then
    smoke-test save/load/predict on a single sentence.

    Cleanup: removed the unused nested ``macro_f1_score`` helper and the
    unused ``score_class_0`` local, plus stale commented-out code.
    """
    corpus: CategorizedCorpus = DataFetcher.load_corpus(
        NLPData.AIVIVN2019_SA_SAMPLE)
    params = {
        "vectorizer": CountVectorizer(ngram_range=(1, 2), max_features=4000),
        "svc": SVC(kernel='linear', C=0.3)
    }
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.SVC,
                                **params)
    model_trainer = ModelTrainer(classifier, corpus)
    tmp_model_folder = mkdtemp()

    def negative_f1_score(y_true, y_pred):
        # f1_score(average=None) returns one score per class; keep class 1
        # (the negative class, per this scorer's name).
        _, score_class_1 = f1_score(y_true, y_pred, average=None)
        return score_class_1

    score = model_trainer.train(tmp_model_folder, scoring=negative_f1_score)
    print(score)
    classifier = TextClassifier.load(tmp_model_folder)
    sentence = Sentence('tuyệt vời')
    classifier.predict(sentence)
    shutil.rmtree(tmp_model_folder)
    print(sentence)
def my_run(estimator__C, features__lower_pipe__tfidf__ngram_range,
           features__with_tone_char__ngram_range,
           features__remove_tone__tfidf__ngram_range):
    """Sacred experiment: train a word+char TF-IDF SVC on VLSP2016_SA and
    log dev/test scores.

    Cleanup: removed the unused nested ``macro_f1_score`` helper and the
    unused per-class unpack locals.
    """
    # Must run before any other local is bound: capture exactly the
    # hyper-parameter arguments so they can be forwarded to set_params.
    params = locals().copy()
    start = time.time()
    print(params)
    corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.VLSP2016_SA)
    pipeline = Pipeline(steps=[
        ('features', FeatureUnion([
            ('lower_pipe', Pipeline([
                ('lower', Lowercase()),
                ('tfidf', TfidfVectorizer(ngram_range=(1, 4), norm='l2',
                                          min_df=2))])),
            ('with_tone_char', TfidfVectorizer(ngram_range=(1, 6), norm='l2',
                                               min_df=2, analyzer='char')),
            ('remove_tone', Pipeline([
                ('remove_tone', RemoveTone()),
                ('lower', Lowercase()),
                ('tfidf', TfidfVectorizer(ngram_range=(1, 4), norm='l2',
                                          min_df=2))])),
            ('emoticons', CountEmoticons())])),
        ('estimator', SVC(kernel='linear', C=0.2175, class_weight=None,
                          verbose=True))])
    pipeline.set_params(**params)
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                                pipeline=pipeline)
    model_trainer = ModelTrainer(classifier, corpus)
    tmp_model_folder = mkdtemp()

    def negative_f1_score(y_true, y_pred):
        # Three classes in this corpus; keep the f1 of class index 1
        # (the negative class, per this scorer's name).
        _, score_class_1, _ = f1_score(y_true, y_pred, average=None)
        return score_class_1

    score = model_trainer.train(tmp_model_folder, scoring=negative_f1_score)
    ex.log_scalar('dev_score', score['dev_score'])
    ex.log_scalar('test_score', score['test_score'])
    print(time.time() - start)
    return score['dev_score']
def my_run(estimator__C, features__lower_pipe__tfidf__max_features,
           features__lower_pipe__tfidf__ngram_range,
           features__with_tone_char__ngram_range,
           features__remove_tone__tfidf__ngram_range):
    """Sacred experiment: tune a TF-IDF + linear SVC pipeline on the
    UTS2017_BANK_TC corpus; returns the dev micro-F1."""
    # Capture the hyper-parameter arguments before binding any other local,
    # so they can be forwarded verbatim to Pipeline.set_params.
    params = locals().copy()
    start = time.time()
    print(params)
    corpus: CategorizedCorpus = DataFetcher.load_corpus(
        NLPData.UTS2017_BANK_TC)
    # Three text views (lowercased words, raw chars, tone-stripped words)
    # plus an emoticon counter, concatenated into one feature space.
    lower_pipe = Pipeline([
        ('lower', Lowercase()),
        ('tfidf', TfidfVectorizer(norm='l2', min_df=2))])
    remove_tone_pipe = Pipeline([
        ('remove_tone', RemoveTone()),
        ('lower', Lowercase()),
        ('tfidf', TfidfVectorizer(norm='l2', min_df=2))])
    features = FeatureUnion([
        ('lower_pipe', lower_pipe),
        ('with_tone_char', TfidfVectorizer(norm='l2', min_df=2,
                                           analyzer='char')),
        ('remove_tone', remove_tone_pipe),
        ('emoticons', CountEmoticons())])
    pipeline = Pipeline(steps=[
        ('features', features),
        ('estimator', SVC(kernel='linear', class_weight=None, verbose=True))])
    pipeline.set_params(**params)
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                                pipeline=pipeline)
    model_trainer = ModelTrainer(classifier, corpus)
    tmp_model_folder = "tmp/tmp_model"

    def micro_f1_score(y_true, y_pred):
        return f1_score(y_true, y_pred, average='micro')

    score = model_trainer.train(tmp_model_folder, scoring=micro_f1_score)
    # Drop the trained artifacts but keep the .gitignore placeholder file.
    for file in listdir(tmp_model_folder):
        if "gitignore" not in file:
            os.remove(f"{tmp_model_folder}/{file}")
    ex.log_scalar('dev_score', score['dev_score'])
    ex.log_scalar('test_score', score['test_score'])
    print(f"Time: {round(time.time() - start, 2)} s")
    return score['dev_score']
def run(estimator, features):
    """Sacred experiment: word+char TF-IDF SVC on AIVIVN2019_SA; returns the
    test-set negative-class F1.

    NOTE(review): ``estimator`` and ``features`` appear to be injected by the
    experiment config and are not read in this body — confirm against the
    config wiring before removing them.

    Cleanup: removed the unused nested ``macro_f1_score`` helper and the
    unused ``score_class_0`` local.
    """
    corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.AIVIVN2019_SA)
    pipeline = Pipeline(steps=[
        ('features', FeatureUnion([
            ('lower_tfidf', Pipeline([
                ('lower', Lowercase()),
                ('tfidf', TfidfVectorizer(ngram_range=(1, 4), norm='l2',
                                          min_df=2))])),
            ('with_tone_char', TfidfVectorizer(ngram_range=(1, 6), norm='l2',
                                               min_df=2, analyzer='char')),
            ('remove_tone', Pipeline([
                ('remove_tone', RemoveTone()),
                ('lower', Lowercase()),
                ('tfidf', TfidfVectorizer(ngram_range=(1, 4), norm='l2',
                                          min_df=2))]))])),
        ('estimator', SVC(kernel='linear', C=0.2175, class_weight=None,
                          verbose=True))])
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                                pipeline=pipeline)
    model_trainer = ModelTrainer(classifier, corpus)
    tmp_model_folder = mkdtemp()

    def negative_f1_score(y_true, y_pred):
        # Two classes; keep the f1 of class index 1 (the negative class,
        # per this scorer's name).
        _, score_class_1 = f1_score(y_true, y_pred, average=None)
        return score_class_1

    score = model_trainer.train(tmp_model_folder, scoring=negative_f1_score)
    ex.log_scalar('dev_score', score['dev_score'])
    ex.log_scalar('test_score', score['test_score'])
    return score['test_score']
def test_fasttext(self):
    """Train a fastText classifier on the AIVIVN2019 sample corpus, then
    smoke-test save/load/predict on a single sentence."""
    corpus: CategorizedCorpus = DataFetcher.load_corpus(
        NLPData.AIVIVN2019_SA_SAMPLE)
    classifier = TextClassifier(
        estimator=TEXT_CLASSIFIER_ESTIMATOR.FAST_TEXT,
        lr=0.01, epoch=20, wordNgrams=3, dim=20)
    trainer = ModelTrainer(classifier, corpus)
    model_dir = mkdtemp()

    def macro_f1_score(y_true, y_pred):
        return f1_score(y_true, y_pred, average='macro')

    score = trainer.train(model_dir, scoring=macro_f1_score)
    print(score)
    # Round-trip: reload the persisted model and tag one sentence.
    classifier = TextClassifier.load(model_dir)
    sentence = Sentence('tuyệt vời')
    classifier.predict(sentence)
    shutil.rmtree(model_dir)
    print(sentence)
def my_run(features__max_df, features__ngram_range):
    """Sacred experiment: TF-IDF + LinearSVC topic classifier on VNTC;
    returns the dev micro-F1."""
    # Capture the hyper-parameter arguments before any other local is bound
    # (including the import below), so set_params receives only them.
    params = locals().copy()
    start = time.time()
    print(params)
    from languageflow.data_fetcher import DataFetcher
    corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.VNTC)
    pipeline = Pipeline(steps=[('features', TfidfVectorizer()),
                               ('estimator', LinearSVC())])
    pipeline.set_params(**params)
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                                pipeline=pipeline)
    trainer = ModelTrainer(classifier, corpus)
    model_dir = mkdtemp()

    def micro_f1_score(y_true, y_pred):
        return f1_score(y_true, y_pred, average='micro')

    score = trainer.train(model_dir, scoring=micro_f1_score)
    ex.log_scalar('dev_score', score['dev_score'])
    ex.log_scalar('test_score', score['test_score'])
    print(time.time() - start)
    return score['dev_score']
def test(self):
    """Train a multilabel one-vs-rest SVC on the UTS2017 bank sample corpus,
    then smoke-test save/load/predict on a single sentence."""
    corpus: CategorizedCorpus = DataFetcher.load_corpus(
        NLPData.UTS2017_BANK_SA_SAMPLE)
    steps = [
        ('features', CountVectorizer(ngram_range=(1, 2), max_features=4000)),
        ('estimator', OneVsRestClassifier(SVC(kernel='linear', C=0.3))),
    ]
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                                pipeline=Pipeline(steps=steps),
                                multilabel=True)
    trainer = ModelTrainer(classifier, corpus)
    model_dir = mkdtemp()

    def macro_f1_score(y_true, y_pred):
        return f1_score(y_true, y_pred, average='macro')

    score = trainer.train(model_dir, scoring=macro_f1_score)
    print(score)
    # Round-trip: reload the persisted model and tag one sentence.
    classifier = TextClassifier.load(model_dir)
    sentence = Sentence('Dịch vụ tiện dụng quá')
    classifier.predict(sentence)
    print(sentence)
    shutil.rmtree(model_dir)
def remove(data):
    """Delete the dataset named `data` by delegating to DataFetcher.remove."""
    DataFetcher.remove(data)
def list(all):
    """List datasets known to DataFetcher.

    NOTE(review): `list` and `all` shadow Python builtins; kept as-is because
    they look like a CLI command name and flag — confirm against the CLI
    wiring before renaming.
    """
    DataFetcher.list(all)
def download(dataset, url):
    """Download `dataset` (from `url`) by delegating to DataFetcher.download_data."""
    DataFetcher.download_data(dataset, url)
from sklearn.svm import LinearSVC model_folder = "tmp/classification_svm_vntc" try: shutil.rmtree(model_folder) except: pass finally: os.makedirs(model_folder) tfidf__ngram_range = (1, 2) tfidf__max_df = 0.5 start = time.time() print(">>> Train VNTC Classification") corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.VNTC) print("\n\n>>> Sample sentences") for s in corpus.train[:10]: print(s) pipeline = Pipeline( steps=[('features', TfidfVectorizer(ngram_range=tfidf__ngram_range, max_df=tfidf__max_df)), ('estimator', LinearSVC())]) print("\n\n>>> Start training") classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE, pipeline=pipeline) model_trainer = ModelTrainer(classifier, corpus) def micro_f1_score(y_true, y_pred):
from languageflow.data import Sentence
from languageflow.data_fetcher import NLPData, DataFetcher
from languageflow.models.text_classifier import TextClassifier

# Load a previously trained bank-sentiment model and run it on one example.
model_folder = "tmp/sentiment_svm_uts2017_bank_sa"
print(f"Load model from {model_folder}")
classifier = TextClassifier.load(model_folder)
print("Model is loaded.")


def predict(text):
    """Tag `text` with the loaded classifier and print its labels."""
    print(f"\nText: {text}")
    s = Sentence(text)
    classifier.predict(s)
    print(f"Labels: {s.labels}")


corpus = DataFetcher.load_corpus(NLPData.UTS2017_BANK_SA)
predict(
    'Bạn nên làm thẻ credit, đừng làm debit. Mình dùng thẻ debit của vcb, tết vừa rồi bị hack mất 28 triệu trong tài khoản mà đến giờ vcb đã giải quyết cho mình đâu. Bực mình!'
)
try: shutil.rmtree(model_folder) except: pass finally: os.makedirs(model_folder) lower__ngram_range = (1, 3) with_tone__ngram_range = (1, 4) remove_tone__ngram_range = (1, 4) count__max_features = 4000 estimator__C = 0.75 start = time.time() print(">>> Train UTS2017_BANK Classification") corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.UTS2017_BANK_TC) print("\n\n>>> Sample sentences") for s in corpus.train[:10]: print(s) pipeline = Pipeline( steps=[ ('features', FeatureUnion([ ('lower_pipe', Pipeline([ ('lower', Lowercase()), ('tfidf', TfidfVectorizer(ngram_range=lower__ngram_range, norm='l2', min_df=2, max_features=count__max_features))])), ('with_tone_char', TfidfVectorizer(ngram_range=with_tone__ngram_range, norm='l2', min_df=2, analyzer='char')), ('remove_tone', Pipeline([ ('remove_tone', RemoveTone()), ('lower', Lowercase()), ('tfidf', TfidfVectorizer(ngram_range=remove_tone__ngram_range, norm='l2', min_df=2))])), ('emoticons', CountEmoticons())
def test_uts2017_bank_sa(self):
    """Smoke-test loading of the UTS2017_BANK_SA corpus."""
    print(DataFetcher.load_corpus(NLPData.UTS2017_BANK_SA))
def test_vlsp2016_sa(self):
    """Smoke-test loading of the VLSP2016_SA corpus."""
    print(DataFetcher.load_corpus(NLPData.VLSP2016_SA))
def import_corpus(data, input_data_path):
    """Import the corpus named `data` from `input_data_path` by delegating to
    DataFetcher.import_corpus."""
    DataFetcher.import_corpus(data, input_data_path)
from text_features import Lowercase, RemoveTone, CountEmoticons model_folder = "tmp/sentiment_svm_vlsp2016" try: shutil.rmtree(model_folder) except: pass finally: os.makedirs(model_folder) estimator_C = 0.375 lower_tfidf__ngram_range = (1, 3) with_tone_char__ngram_range = (1, 5) remove_tone__tfidf__ngram_range = (1, 2) print(">>> Train VLSP2016 Sentiment Analysis") corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.VLSP2016_SA) print("\n\n>>> Sample sentences") for s in corpus.train[:10]: print(s) pipeline = Pipeline(steps=[ ('features', FeatureUnion([( 'lower_tfidf', Pipeline([('lower', Lowercase()), ('tfidf', TfidfVectorizer(ngram_range=lower_tfidf__ngram_range, norm='l2', min_df=2))])), ('with_tone_char', TfidfVectorizer(ngram_range=with_tone_char__ngram_range, norm='l2',
def test_import_corpus(self):
    """Import the bundled VLSP2016_SA raw sample shipped with the package."""
    project_root = dirname(dirname(__file__))
    input_data_path = join(project_root, "languageflow", "data",
                           "vlsp2016_sa_raw_sample")
    DataFetcher.import_corpus("VLSP2016_SA", input_data_path)
from tempfile import mkdtemp

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

from languageflow.data import CategorizedCorpus
from languageflow.data_fetcher import DataFetcher, NLPData
from languageflow.models.text_classifier import TextClassifier, TEXT_CLASSIFIER_ESTIMATOR
from languageflow.trainers.model_trainer import ModelTrainer

# Set up an SVC over 1-2 gram counts on the AIVIVN2019 sentiment corpus.
corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.AIVIVN2019_SA)
pipeline = Pipeline(steps=[
    ('features', CountVectorizer(ngram_range=(1, 2), max_features=4000)),
    ('estimator', SVC(kernel='linear', C=0.3))])
classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                            pipeline=pipeline)
model_trainer = ModelTrainer(classifier, corpus)
tmp_model_folder = mkdtemp()


def negative_f1_score(y_true, y_pred):
    """F1 of class index 1 (the negative class, per this scorer's name)."""
    _, score_class_1 = f1_score(y_true, y_pred, average=None)
    return score_class_1


def macro_f1_score(y_true, y_pred):
    """Macro-averaged F1 over all classes."""
    return f1_score(y_true, y_pred, average='macro')
def test_vntc(self):
    """Smoke-test loading of the VNTC corpus."""
    print(DataFetcher.load_corpus(NLPData.VNTC))