Code example #1
File: test_svc.py Project: quang-ph/languageflow
    def test(self):
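        # Train a linear-kernel SVC on word/bigram counts, score it with the negative-class F1,
        # then reload the saved model and predict a sample sentence.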
        corpus: CategorizedCorpus = DataFetcher.load_corpus(
            NLPData.AIVIVN2019_SA_SAMPLE)
        # corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.AIVIVN2019_SA)
        params = {
            "vectorizer": CountVectorizer(ngram_range=(1, 2),
                                          max_features=4000),
            "svc": SVC(kernel='linear', C=0.3)
        }
        classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.SVC,
                                    **params)
        model_trainer = ModelTrainer(classifier, corpus)
        tmp_model_folder = mkdtemp()

        def negative_f1_score(y_true, y_pred):
            score_class_0, score_class_1 = f1_score(y_true,
                                                    y_pred,
                                                    average=None)
            return score_class_1

        def macro_f1_score(y_true, y_pred):
            return f1_score(y_true, y_pred, average='macro')

        score = model_trainer.train(tmp_model_folder,
                                    scoring=negative_f1_score)
        print(score)

        classifier = TextClassifier.load(tmp_model_folder)
        sentence = Sentence('tuyệt vời')
        classifier.predict(sentence)
        shutil.rmtree(tmp_model_folder)
        print(sentence)
Code example #2
def my_run(estimator__C, features__lower_pipe__tfidf__ngram_range,
           features__with_tone_char__ngram_range,
           features__remove_tone__tfidf__ngram_range):
    params = locals().copy()
    start = time.time()
    print(params)
    corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.VLSP2016_SA)
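    # Feature union: lowercased word TF-IDF, character-level TF-IDF (tones kept),
    # tone-stripped word TF-IDF and emoticon counts.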
    pipeline = Pipeline(steps=[
        ('features', FeatureUnion([
            ('lower_pipe', Pipeline([
                ('lower', Lowercase()),
                ('tfidf', TfidfVectorizer(ngram_range=(1, 4), norm='l2', min_df=2))])),
            ('with_tone_char', TfidfVectorizer(ngram_range=(1, 6), norm='l2',
                                               min_df=2, analyzer='char')),
            ('remove_tone', Pipeline([
                ('remove_tone', RemoveTone()),
                ('lower', Lowercase()),
                ('tfidf', TfidfVectorizer(ngram_range=(1, 4), norm='l2', min_df=2))])),
            ('emoticons', CountEmoticons())])),
        ('estimator', SVC(kernel='linear', C=0.2175, class_weight=None, verbose=True))])
    pipeline.set_params(**params)
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                                pipeline=pipeline)
    model_trainer = ModelTrainer(classifier, corpus)
    tmp_model_folder = mkdtemp()

    def negative_f1_score(y_true, y_pred):
        score_class_0, score_class_1, score_class_2 = f1_score(y_true,
                                                               y_pred,
                                                               average=None)
        return score_class_1

    def macro_f1_score(y_true, y_pred):
        return f1_score(y_true, y_pred, average='macro')

    score = model_trainer.train(tmp_model_folder, scoring=negative_f1_score)
    ex.log_scalar('dev_score', score['dev_score'])
    ex.log_scalar('test_score', score['test_score'])
    print(time.time() - start)
    return score['dev_score']
Code example #3
def my_run(estimator__C, features__lower_pipe__tfidf__max_features,
           features__lower_pipe__tfidf__ngram_range,
           features__with_tone_char__ngram_range,
           features__remove_tone__tfidf__ngram_range):
    params = locals().copy()
    start = time.time()
    print(params)
    corpus: CategorizedCorpus = DataFetcher.load_corpus(
        NLPData.UTS2017_BANK_TC)
    pipeline = Pipeline(steps=[
        ('features', FeatureUnion([
            ('lower_pipe', Pipeline([
                ('lower', Lowercase()),
                ('tfidf', TfidfVectorizer(norm='l2', min_df=2))])),
            ('with_tone_char', TfidfVectorizer(norm='l2', min_df=2, analyzer='char')),
            ('remove_tone', Pipeline([
                ('remove_tone', RemoveTone()),
                ('lower', Lowercase()),
                ('tfidf', TfidfVectorizer(norm='l2', min_df=2))])),
            ('emoticons', CountEmoticons())])),
        ('estimator', SVC(kernel='linear', class_weight=None, verbose=True))])
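    # Apply the hyperparameters captured from the function arguments to the pipeline.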
    pipeline.set_params(**params)
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                                pipeline=pipeline)
    model_trainer = ModelTrainer(classifier, corpus)
    tmp_model_folder = "tmp/tmp_model"

    def micro_f1_score(y_true, y_pred):
        return f1_score(y_true, y_pred, average='micro')

    score = model_trainer.train(tmp_model_folder, scoring=micro_f1_score)
    tmp_files = listdir(tmp_model_folder)
    for file in tmp_files:
        if "gitignore" in file:
            continue
        os.remove(f"{tmp_model_folder}/{file}")
    ex.log_scalar('dev_score', score['dev_score'])
    ex.log_scalar('test_score', score['test_score'])
    print(f"Time: {round(time.time() - start, 2)} s")
    return score['dev_score']
Code example #4
def run(estimator, features):
    corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.AIVIVN2019_SA)
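    # Word, character and tone-removed TF-IDF features feed a linear-kernel SVC;
    # dev and test scores are logged with ex.log_scalar.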
    pipeline = Pipeline(steps=[
        ('features', FeatureUnion([
            ('lower_tfidf', Pipeline([
                ('lower', Lowercase()),
                ('tfidf', TfidfVectorizer(ngram_range=(1, 4), norm='l2', min_df=2))])),
            ('with_tone_char', TfidfVectorizer(ngram_range=(1, 6), norm='l2',
                                               min_df=2, analyzer='char')),
            ('remove_tone', Pipeline([
                ('remove_tone', RemoveTone()),
                ('lower', Lowercase()),
                ('tfidf', TfidfVectorizer(ngram_range=(1, 4), norm='l2', min_df=2))]))])),
        ('estimator', SVC(kernel='linear', C=0.2175, class_weight=None, verbose=True))])
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                                pipeline=pipeline)
    model_trainer = ModelTrainer(classifier, corpus)
    tmp_model_folder = mkdtemp()

    def negative_f1_score(y_true, y_pred):
        score_class_0, score_class_1 = f1_score(y_true, y_pred, average=None)
        return score_class_1

    def macro_f1_score(y_true, y_pred):
        return f1_score(y_true, y_pred, average='macro')

    score = model_trainer.train(tmp_model_folder, scoring=negative_f1_score)
    ex.log_scalar('dev_score', score['dev_score'])
    ex.log_scalar('test_score', score['test_score'])
    return score['test_score']
Code example #5
    def test_fasttext(self):
        corpus: CategorizedCorpus = DataFetcher.load_corpus(
            NLPData.AIVIVN2019_SA_SAMPLE)
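        # fastText hyperparameters: learning rate, training epochs, word n-gram length and embedding dimension.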
        params = {"lr": 0.01, "epoch": 20, "wordNgrams": 3, "dim": 20}
        classifier = TextClassifier(
            estimator=TEXT_CLASSIFIER_ESTIMATOR.FAST_TEXT, **params)
        model_trainer = ModelTrainer(classifier, corpus)
        tmp_model_folder = mkdtemp()

        def macro_f1_score(y_true, y_pred):
            return f1_score(y_true, y_pred, average='macro')

        score = model_trainer.train(tmp_model_folder, scoring=macro_f1_score)
        print(score)

        classifier = TextClassifier.load(tmp_model_folder)
        sentence = Sentence('tuyệt vời')
        classifier.predict(sentence)
        shutil.rmtree(tmp_model_folder)
        print(sentence)
Code example #6
def my_run(features__max_df, features__ngram_range):
    params = locals().copy()
    start = time.time()
    print(params)
    from languageflow.data_fetcher import DataFetcher
    corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.VNTC)
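    # Baseline pipeline: word TF-IDF features and a LinearSVC, tuned only on max_df and ngram_range.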
    pipeline = Pipeline(steps=[('features', TfidfVectorizer()),
                               ('estimator', LinearSVC())])
    pipeline.set_params(**params)
    classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                                pipeline=pipeline)
    model_trainer = ModelTrainer(classifier, corpus)
    tmp_model_folder = mkdtemp()

    def micro_f1_score(y_true, y_pred):
        return f1_score(y_true, y_pred, average='micro')

    score = model_trainer.train(tmp_model_folder, scoring=micro_f1_score)
    ex.log_scalar('dev_score', score['dev_score'])
    ex.log_scalar('test_score', score['test_score'])
    print(time.time() - start)
    return score['dev_score']
Code example #7
    def test(self):
        corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.UTS2017_BANK_SA_SAMPLE)
        # corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.AIVIVN2019_SA)
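        # Multi-label setup: one binary SVC per label via OneVsRestClassifier, evaluated with macro F1.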
        pipeline = Pipeline(
            steps=[('features', CountVectorizer(ngram_range=(1, 2), max_features=4000)),
                   ('estimator', OneVsRestClassifier(SVC(kernel='linear', C=0.3)))]
        )
        classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE, pipeline=pipeline, multilabel=True)
        model_trainer = ModelTrainer(classifier, corpus)
        tmp_model_folder = mkdtemp()

        def macro_f1_score(y_true, y_pred):
            return f1_score(y_true, y_pred, average='macro')

        score = model_trainer.train(tmp_model_folder, scoring=macro_f1_score)
        print(score)

        classifier = TextClassifier.load(tmp_model_folder)

        sentence = Sentence('Dịch vụ tiện dụng quá')
        classifier.predict(sentence)
        print(sentence)

        shutil.rmtree(tmp_model_folder)
Code example #8
File: cli.py Project: hogwart120/lbminh-bot-test
def remove(data):
    DataFetcher.remove(data)
Code example #9
File: cli.py Project: hogwart120/lbminh-bot-test
def list(all):
    DataFetcher.list(all)
Code example #10
File: cli.py Project: hogwart120/lbminh-bot-test
def download(dataset, url):
    DataFetcher.download_data(dataset, url)
Code example #11
from sklearn.svm import LinearSVC

model_folder = "tmp/classification_svm_vntc"
try:
    shutil.rmtree(model_folder)
except:
    pass
finally:
    os.makedirs(model_folder)

tfidf__ngram_range = (1, 2)
tfidf__max_df = 0.5

start = time.time()
print(">>> Train VNTC Classification")
corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.VNTC)
print("\n\n>>> Sample sentences")
for s in corpus.train[:10]:
    print(s)
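# TF-IDF features and a LinearSVC for VNTC topic classification.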
pipeline = Pipeline(
    steps=[('features', TfidfVectorizer(ngram_range=tfidf__ngram_range,
                                        max_df=tfidf__max_df)),
           ('estimator', LinearSVC())])
print("\n\n>>> Start training")
classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                            pipeline=pipeline)
model_trainer = ModelTrainer(classifier, corpus)


def micro_f1_score(y_true, y_pred):
    return f1_score(y_true, y_pred, average='micro')
Code example #12
from languageflow.data import Sentence
from languageflow.data_fetcher import NLPData, DataFetcher
from languageflow.models.text_classifier import TextClassifier

model_folder = "tmp/sentiment_svm_uts2017_bank_sa"
print(f"Load model from {model_folder}")
classifier = TextClassifier.load(model_folder)
print(f"Model is loaded.")


def predict(text):
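    # Wrap the raw text in a Sentence, run the classifier and print the predicted labels.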
    print(f"\nText: {text}")

    sentence = Sentence(text)
    classifier.predict(sentence)
    labels = sentence.labels
    print(f"Labels: {labels}")


corpus = DataFetcher.load_corpus(NLPData.UTS2017_BANK_SA)

predict(
    'Bạn nên làm thẻ credit, đừng làm debit. Mình dùng thẻ debit của vcb, tết vừa rồi bị hack mất 28 triệu trong tài khoản mà đến giờ vcb đã giải quyết cho mình đâu. Bực mình!'
)
Code example #13
try:
    shutil.rmtree(model_folder)
except:
    pass
finally:
    os.makedirs(model_folder)

lower__ngram_range = (1, 3)
with_tone__ngram_range = (1, 4)
remove_tone__ngram_range = (1, 4)
count__max_features = 4000
estimator__C = 0.75

start = time.time()
print(">>> Train UTS2017_BANK Classification")
corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.UTS2017_BANK_TC)
print("\n\n>>> Sample sentences")
for s in corpus.train[:10]:
    print(s)
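# Feature union: lowercased word TF-IDF, character-level TF-IDF, tone-removed TF-IDF and emoticon counts.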
pipeline = Pipeline(
    steps=[
        ('features', FeatureUnion([
            ('lower_pipe', Pipeline([
                ('lower', Lowercase()),
                ('tfidf', TfidfVectorizer(ngram_range=lower__ngram_range, norm='l2', min_df=2, max_features=count__max_features))])),
            ('with_tone_char', TfidfVectorizer(ngram_range=with_tone__ngram_range, norm='l2', min_df=2, analyzer='char')),
            ('remove_tone', Pipeline([
                ('remove_tone', RemoveTone()),
                ('lower', Lowercase()),
                ('tfidf', TfidfVectorizer(ngram_range=remove_tone__ngram_range, norm='l2', min_df=2))])),
            ('emoticons', CountEmoticons())
Code example #14
    def test_uts2017_bank_sa(self):
        corpus = DataFetcher.load_corpus(NLPData.UTS2017_BANK_SA)
        print(corpus)
Code example #15
    def test_vlsp2016_sa(self):
        corpus = DataFetcher.load_corpus(NLPData.VLSP2016_SA)
        print(corpus)
Code example #16
File: cli.py Project: hogwart120/lbminh-bot-test
def import_corpus(data, input_data_path):
    DataFetcher.import_corpus(data, input_data_path)
Code example #17
from text_features import Lowercase, RemoveTone, CountEmoticons
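# Lowercase, RemoveTone and CountEmoticons are custom transformers imported from the local text_features module.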

model_folder = "tmp/sentiment_svm_vlsp2016"
try:
    shutil.rmtree(model_folder)
except:
    pass
finally:
    os.makedirs(model_folder)
estimator_C = 0.375
lower_tfidf__ngram_range = (1, 3)
with_tone_char__ngram_range = (1, 5)
remove_tone__tfidf__ngram_range = (1, 2)

print(">>> Train VLSP2016 Sentiment Analysis")
corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.VLSP2016_SA)
print("\n\n>>> Sample sentences")
for s in corpus.train[:10]:
    print(s)
pipeline = Pipeline(steps=[
    ('features',
     FeatureUnion([(
         'lower_tfidf',
         Pipeline([('lower', Lowercase()),
                   ('tfidf',
                    TfidfVectorizer(ngram_range=lower_tfidf__ngram_range,
                                    norm='l2',
                                    min_df=2))])),
                   ('with_tone_char',
                    TfidfVectorizer(ngram_range=with_tone_char__ngram_range,
                                    norm='l2',
Code example #18
    def test_import_corpus(self):
        input_data_path = join(dirname(dirname(__file__)), "languageflow",
                               "data", "vlsp2016_sa_raw_sample")
        DataFetcher.import_corpus("VLSP2016_SA", input_data_path)
Code example #19
File: aivivn2019.py Project: quang-ph/sentiment
from tempfile import mkdtemp

from languageflow.data import CategorizedCorpus
from languageflow.data_fetcher import DataFetcher, NLPData
from languageflow.models.text_classifier import TextClassifier, TEXT_CLASSIFIER_ESTIMATOR
from languageflow.trainers.model_trainer import ModelTrainer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

corpus: CategorizedCorpus = DataFetcher.load_corpus(NLPData.AIVIVN2019_SA)
pipeline = Pipeline(
    steps=[('features', CountVectorizer(ngram_range=(1, 2), max_features=4000)),
           ('estimator', SVC(kernel='linear', C=0.3))])
classifier = TextClassifier(estimator=TEXT_CLASSIFIER_ESTIMATOR.PIPELINE,
                            pipeline=pipeline)
model_trainer = ModelTrainer(classifier, corpus)
tmp_model_folder = mkdtemp()


def negative_f1_score(y_true, y_pred):
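    # With average=None, f1_score returns one value per class; keep the second (negative) class.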
    score_class_0, score_class_1 = f1_score(y_true, y_pred, average=None)
    return score_class_1


def macro_f1_score(y_true, y_pred):
    return f1_score(y_true, y_pred, average='macro')

Code example #20
    def test_vntc(self):
        corpus = DataFetcher.load_corpus(NLPData.VNTC)
        print(corpus)