Ejemplo n.º 1
0
from os.path import dirname, join
from sklearn.feature_extraction.text import TfidfVectorizer
from load_data import load_dataset
from model import SVCModel

# Restaurant-domain VLSP 2018 corpus, resolved relative to this file
# (three levels up from here, then into data/vlsp2018/corpus/restaurant).
_corpus_dir = join(dirname(dirname(dirname(__file__))),
                   "data", "vlsp2018", "corpus", "restaurant")
data_train = join(_corpus_dir, "train.xlsx")
data_dev = join(_corpus_dir, "dev.xlsx")

X_train, y_train = load_dataset(data_train)
X_dev, y_dev = load_dataset(data_dev)

# Fit a linear SVC on 1-3 gram TF-IDF features, score it on the dev
# split, and export the fitted pipeline for serving.
model = SVCModel("Tfidf Trigram", TfidfVectorizer(ngram_range=(1, 3)))
model.load_data(X_train, y_train)
model.fit_transform()
model.train()
model.evaluate(X_dev, y_dev)
model.export(folder="exported/svc")
Ejemplo n.º 2
0
from os.path import dirname, join

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from load_data import load_dataset
from model import SVCModel

# Restaurant-domain VLSP 2018 corpus paths, resolved relative to this file.
# Fixed: `join`/`dirname` and both vectorizer classes were used below but
# never imported, so this script raised NameError on its first line of work.
data_train = join(dirname(dirname(dirname(__file__))), "data", "vlsp2018",
                  "corpus", "restaurant", "train.xlsx")
data_dev = join(dirname(dirname(dirname(__file__))), "data", "vlsp2018",
                "corpus", "restaurant", "dev.xlsx")

X_train, y_train = load_dataset(data_train)
X_dev, y_dev = load_dataset(data_dev)

# Merge train and dev so every model in the grid sees the full corpus.
X = X_train + X_dev
y = y_train + y_dev

# Baseline grid: bigram/trigram ranges for both TF-IDF and raw counts.
models = [
    SVCModel("Tfidf Bigram", TfidfVectorizer(ngram_range=(1, 2))),
    SVCModel("Tfidf Trigram", TfidfVectorizer(ngram_range=(1, 3))),
    SVCModel("Count Bigram", CountVectorizer(ngram_range=(1, 2))),
    SVCModel("Count Trigram", CountVectorizer(ngram_range=(1, 3)))
]

# Vocabulary-size sweep for the count vectorizer.
for n in [2000, 5000, 10000]:
    model = SVCModel("Count Max Feature {}".format(n),
                     CountVectorizer(max_features=n))
    models.append(model)

# Vocabulary-size sweep for the TF-IDF vectorizer.
# Fixed: these models were mislabelled "Count Max Feature" even though they
# wrap TfidfVectorizer, making the two sweeps indistinguishable by name.
for n in [2000, 5000, 10000]:
    model = SVCModel("Tfidf Max Feature {}".format(n),
                     TfidfVectorizer(max_features=n))
    models.append(model)
Ejemplo n.º 3
0
import os

# Fixed: this must be set BEFORE TensorFlow is imported; the original set it
# after `from tensorflow.keras...`, which is too late to suppress the logs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import pickle   # fixed: used below but was never imported
import sqlite3  # retained for the commented-out prediction-log setup below

from flask import Flask  # fixed: Flask() is instantiated below but was never imported
from tensorflow.keras.models import load_model

from model import SVCModel

# One-time SQLite setup for logging predictions; kept commented for reference.
#conn = sqlite3.connect('database.db')
#print ("Opened database successfully")

#conn.execute('CREATE TABLE table1 (Input TEXT, Prediction TEXT)')
#print ("Table created successfully")
#conn.close()

app = Flask(__name__)

model = SVCModel()

# SECURITY NOTE: pickle.load executes arbitrary code from the file — only
# load artifacts you produced yourself. The absolute, machine-specific
# paths below should eventually move to configuration.
clf_path = 'C:/Users/jaspreetsingh5/Documents/Fitara_pred_flask/models/final_prediction.pkl'
with open(clf_path, 'rb') as f:
    model.clf = pickle.load(f)

vec_path = 'C:/Users/jaspreetsingh5/Documents/Fitara_pred_flask/models/CountVectorizer.pkl'
with open(vec_path, 'rb') as f:
    model.vectorizer = pickle.load(f)

# Secondary Keras RNN classifier plus its fitted tokenizer.
model.clf1 = load_model(r'C:\Users\jaspreetsingh5\Documents\flask\my_model.h5')

vec_path1 = 'C:/Users/jaspreetsingh5/Documents/Fitara_pred_flask/models/rnntokenizernn.pkl'
with open(vec_path1, 'rb') as f:
    model.vectorizer1 = pickle.load(f)
Ejemplo n.º 4
0
from os.path import dirname, join
from sklearn.feature_extraction.text import TfidfVectorizer
from load_data import load_dataset
from model import SVCModel

# Hotel-domain VLSP 2018 corpus, resolved relative to this file
# (three levels up, then into data/vlsp2018/corpus/hotel).
_corpus_dir = join(dirname(dirname(dirname(__file__))),
                   "data", "vlsp2018", "corpus", "hotel")
data_train = join(_corpus_dir, "train.xlsx")
data_dev = join(_corpus_dir, "dev.xlsx")

X_train, y_train = load_dataset(data_train)
X_dev, y_dev = load_dataset(data_dev)

# Final exported model is fit on the combined train + dev split.
X, y = X_train + X_dev, y_train + y_dev

# 1-3 gram TF-IDF capped at 7000 features, then train, score on dev,
# and export the fitted pipeline.
model = SVCModel("Tfidf Trigram",
                 TfidfVectorizer(ngram_range=(1, 3), max_features=7000))
model.load_data(X, y)
model.fit_transform()
model.train()
model.evaluate(X_dev, y_dev)
model.export(folder="exported/svc_full")
Ejemplo n.º 5
0
            stop_words = 'english'
        if 'lemmatizer' in options:
            lemmatizer = True
    # read data from files
    # NOTE(review): the parsed splits are bound to underscore-prefixed names
    # and never used below, yet remove_minor_classes() is called without any
    # data argument — verify it re-reads the corpus internally, otherwise
    # this parse_reuters() call is dead work or the wiring is a bug.
    _train_text_lst, _train_label_lst, _test_text_lst, _test_label_lst = parse_reuters()

    # remove classes with very few occurrences
    # (keeps only topics appearing more than 3 times across the corpus)
    topics_selected = topics_with_occurences_gt(threshold=3)
    train_text_lst, train_label_lst, test_text_lst, test_label_lst = remove_minor_classes(topics_selected)

    # label transformer: multi-label topics -> binary indicator matrix,
    # with the column order fixed to topics_selected
    label_binarizer = preprocessing.MultiLabelBinarizer(topics_selected)
    label_binarizer.fit(train_label_lst)
    y_train = label_binarizer.transform(train_label_lst)
    y_test = label_binarizer.transform(test_label_lst)

    # model
    # Name encodes the preprocessing choices so saved runs are distinguishable.
    model_name = '{}_{}_{}'.format(model_id,
                                   'non-stopwords' if stop_words is None else 'use-stopwords',
                                   'non-lemma' if not lemmatizer else 'use-lemma')
    # Dispatch on model_id; unknown ids fail fast with a clear error.
    if model_id == 'NaiveBayes':
        model = NBModel(model_name, stop_words, lemmatizer)
    elif model_id == 'LR':
        model = LRModel(model_name, stop_words, lemmatizer)
    elif model_id == 'SVC':
        model = SVCModel(model_name, stop_words, lemmatizer)
    elif model_id == 'XGBoost':
        model = XGBModel(model_name, stop_words, lemmatizer, max_features=10000)
    else:
        raise ValueError('This model %s is not supported' % model_id)
    model.train_validate_predict(train_text_lst, y_train, test_text_lst, y_test, topics_selected)