def test_bigru(self):
    """End-to-end test of the 'bigru' text classifier: train one epoch,
    then exercise top-losses, weight decay, save/load, validate, and the
    predictor API (including save/reload of the predictor)."""
    trn, val, preproc = txt.texts_from_array(
        x_train=self.trn[0], y_train=self.trn[1],
        x_test=self.val[0], y_test=self.val[1],
        class_names=self.classes,
        preprocess_mode='standard',
        maxlen=350, max_features=35000, ngram_range=1)
    model = txt.text_classifier('bigru', train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val,
                                 batch_size=32, eval_batch_size=EVAL_BS)
    lr = 0.01
    hist = learner.autofit(lr, 1)
    # test training results
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.89)
    # autofit's triangular policy cycles momentum between 0.85 and 0.95
    self.assertAlmostEqual(max(hist.history['momentum']), 0.95)
    self.assertAlmostEqual(min(hist.history['momentum']), 0.85)
    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val[0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)
    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')
    # test validate: confusion matrix should be diagonal-dominant
    cm = learner.validate()
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
    self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor', batch_size=EVAL_BS)
    self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
    self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian')
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def test_fasttext_chinese(self):
    """Train a 'fasttext' classifier on Chinese hotel reviews and exercise
    the learner/predictor API end to end."""
    trn, val, preproc = txt.texts_from_csv(
        "./text_data/chinese_hotel_reviews.csv",
        "content",
        label_columns=["pos", "neg"],
        max_features=30000,
        maxlen=75,
        preprocess_mode="standard",
        sep="|",
    )
    model = txt.text_classifier("fasttext", train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val,
                                 batch_size=32)
    lr = 5e-3
    hist = learner.autofit(lr, 10)
    # test training results
    self.assertAlmostEqual(max(hist.history["lr"]), lr)
    self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.85)
    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val[0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)
    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # test load and save model
    learner.save_model("/tmp/test_model")
    learner.load_model("/tmp/test_model")
    # test validate: confusion matrix should be diagonal-dominant
    cm = learner.validate(class_names=preproc.get_classes())
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc)
    self.assertEqual(p.predict([TEST_DOC])[0], "pos")
    p.save("/tmp/test_predictor")
    p = ktrain.load_predictor("/tmp/test_predictor")
    self.assertEqual(p.predict(TEST_DOC), "pos")
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 0)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def loadmodel(self, path=None):
    """Load a saved ktrain predictor into self.model.

    Runs inside the object's stored TF graph/session so the model binds to
    the right context. Returns True on successful load, False otherwise.

    NOTE(review): the source was whitespace-collapsed; the placement of the
    final ``return False`` (sibling of try/except vs. inside except) is
    reconstructed here — if it was inside the except block, calling with
    path=None originally returned None, not False. Confirm against VCS.
    """
    with self.graph.as_default():
        with self.session.as_default():
            try:
                if path is not None:
                    # load the model
                    self.model = ktrain.load_predictor(path)
                    logging.info("Bert predictor loaded: ")
                    return True
            except Exception as e:
                # broad catch: any load failure is reported, not raised
                print(e)
                logging.exception(e)
            return False
def test_bert(self):
    """Train BERT for one epoch and exercise the learner/predictor API."""
    trn, val, preproc = txt.texts_from_array(
        x_train=self.trn[0], y_train=self.trn[1],
        x_test=self.val[0], y_test=self.val[1],
        class_names=self.classes,
        preprocess_mode="bert",
        maxlen=350,
        max_features=35000,
    )
    model = txt.text_classifier("bert", train_data=trn, preproc=preproc)
    # no val_data here — validation data is passed explicitly below
    learner = ktrain.get_learner(model, train_data=trn, batch_size=6,
                                 eval_batch_size=EVAL_BS)
    lr = 2e-5
    hist = learner.fit_onecycle(lr, 1)
    # test training results (training accuracy, since no val_data was given)
    self.assertAlmostEqual(max(hist.history["lr"]), lr)
    self.assertGreater(max(hist.history[ACC_NAME]), 0.7)
    # test top losses
    obs = learner.top_losses(n=1, val_data=val)
    # BERT val data is (inputs, targets), hence the val[0][0] indexing
    self.assertIn(obs[0][0], list(range(len(val[0][0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=val)
    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # test load and save model
    learner.save_model("/tmp/test_model")
    learner.load_model("/tmp/test_model")
    # test validate: confusion matrix should be diagonal-dominant
    cm = learner.validate(val_data=val)
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
    self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
    p.save("/tmp/test_predictor")
    p = ktrain.load_predictor("/tmp/test_predictor", batch_size=EVAL_BS)
    self.assertEqual(p.predict(TEST_DOC), "soc.religion.christian")
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def test_cora(self):
    """Link prediction on the Cora citation graph with GraphSAGE.

    Trains for 5 one-cycle epochs, then exercises top-losses, weight
    decay, save/load, validation, and predictor save/reload.
    """
    (trn, val, preproc) = gr.graph_links_from_csv(
        'graph_data/cora/cora.content',
        'graph_data/cora/cora.cites',
        sep='\t')
    learner = ktrain.get_learner(
        model=gr.graph_link_predictor('graphsage', trn, preproc),
        train_data=trn, val_data=val)
    lr = 0.01
    hist = learner.fit_onecycle(lr, 5)
    # test training results
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.78)
    # test top losses
    obs = learner.top_losses(n=1, val_data=val)
    self.assertIn(obs[0][0], list(range(val.targets.shape[0])))
    learner.view_top_losses(preproc=preproc, n=1, val_data=val)
    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')
    # test validate — the original called validate() twice in a row and
    # discarded the first (expensive) result; the redundant call is removed.
    cm = learner.validate(val_data=val)
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc)
    self.assertIn(
        p.predict(preproc.G, list(preproc.G.edges()))[:5][0],
        preproc.get_classes())
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor')
    self.assertEqual(
        p.predict(preproc.G, list(preproc.G.edges()))[:5][0],
        preproc.get_classes()[1])
def test_fasttext_chinese(self):
    """Train 'fasttext' on Chinese hotel reviews (older ktrain API variant:
    per-layer weight-decay list, 'val_acc' history key)."""
    trn, val, preproc = txt.texts_from_csv(
        './text_data/chinese_hotel_reviews.csv',
        'content',
        label_columns=["pos", "neg"],
        max_features=30000,
        maxlen=75,
        preprocess_mode='standard',
        sep='|')
    # NOTE(review): unlike the sibling fasttext test, no preproc= is passed
    # to text_classifier here — confirm this matches the ktrain version in use.
    model = txt.text_classifier('fasttext', train_data=trn)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val,
                                 batch_size=32)
    lr = 5e-3
    hist = learner.autofit(lr, 10)
    # test training results
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(max(hist.history['val_acc']), 0.85)
    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val[0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)
    # test weight decay (per-layer API: one entry per eligible layer)
    self.assertEqual(len(learner.get_weight_decay()), 2)
    self.assertEqual(learner.get_weight_decay()[0], None)
    learner.set_weight_decay(1e-4)
    self.assertAlmostEqual(learner.get_weight_decay()[0], 1e-4)
    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')
    # test validate: confusion matrix should be diagonal-dominant
    cm = learner.validate()
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc)
    self.assertEqual(p.predict([TEST_DOC])[0], 'pos')
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor')
    self.assertEqual(p.predict(TEST_DOC), 'pos')
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 0)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def test_classification(self):
    """MLP classifier on the Titanic tabular dataset.

    Trains 30 one-cycle epochs and exercises top-losses, weight decay,
    save/load, evaluation, and predictor save/reload.
    """
    train_df = pd.read_csv("tabular_data/train.csv", index_col=0)
    # DataFrame.drop with a positional axis argument (`drop("Name", 1)`)
    # was deprecated and removed in pandas 2.0 — use columns= instead.
    train_df = train_df.drop(columns=["Name", "Ticket"])
    trn, val, preproc = tabular.tabular_from_df(
        train_df, label_columns="Survived", random_state=42)
    model = tabular.tabular_classifier("mlp", trn)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val,
                                 batch_size=32)
    lr = 0.001
    hist = learner.fit_onecycle(lr, 30)
    # test training results
    self.assertAlmostEqual(max(hist.history["lr"]), lr)
    self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.8)
    # test top losses
    obs = learner.top_losses(n=1, val_data=val)
    self.assertIn(obs[0][0], list(range(val.df.shape[0])))
    learner.view_top_losses(preproc=preproc, n=1, val_data=val)
    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # test load and save model
    learner.save_model("/tmp/test_model")
    learner.load_model("/tmp/test_model")
    # test validate: confusion matrix should be diagonal-dominant
    cm = learner.evaluate(val)
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc)
    predicted_label = p.predict(train_df)[0]
    self.assertIn(predicted_label, preproc.get_classes())
    p.save("/tmp/test_predictor")
    p = ktrain.load_predictor("/tmp/test_predictor")
    self.assertEqual(p.predict(train_df)[0], predicted_label)
def test_transformers_api_2(self):
    """DistilBERT via the ktrain Transformer API: train one epoch and
    exercise the learner/predictor lifecycle."""
    MODEL_NAME = 'distilbert-base-uncased'
    preproc = txt.Transformer(MODEL_NAME, maxlen=500, classes=self.classes)
    trn = preproc.preprocess_train(self.trn[0], self.trn[1])
    val = preproc.preprocess_test(self.val[0], self.val[1])
    model = preproc.get_classifier()
    learner = ktrain.get_learner(model, train_data=trn, val_data=val,
                                 batch_size=6, eval_batch_size=EVAL_BS)
    lr = 5e-5
    hist = learner.fit_onecycle(lr, 1)
    # test training results
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.9)
    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val.x))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)
    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # test load and save model (transformer models save to a directory)
    tmp_folder = ktrain.imports.tempfile.mkdtemp()
    learner.save_model(tmp_folder)
    learner.load_model(tmp_folder)
    # test validate: confusion matrix should be diagonal-dominant
    cm = learner.validate()
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
    self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
    tmp_folder = ktrain.imports.tempfile.mkdtemp()
    p.save(tmp_folder)
    p = ktrain.load_predictor(tmp_folder, batch_size=EVAL_BS)
    self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian')
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def test_regression(self):
    """MLP regression on the adults dataset (predicting age)."""
    trn, val, preproc = tabular.tabular_from_csv(
        "tabular_data/adults.csv",
        label_columns=["age"],
        is_regression=True,
        random_state=42,
    )
    model = tabular.tabular_regression_model("mlp", trn)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val,
                                 batch_size=128)
    lr = 0.001
    hist = learner.autofit(lr, 5)
    # test training results
    self.assertAlmostEqual(max(hist.history["lr"]), lr)
    self.assertLess(min(hist.history["val_mae"]), 8.0)
    # test top losses
    obs = learner.top_losses(n=1, val_data=val)
    self.assertIn(obs[0][0], list(range(val.df.shape[0])))
    learner.view_top_losses(preproc=preproc, n=1, val_data=val)
    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # test load and save model
    learner.save_model("/tmp/test_model")
    learner.load_model("/tmp/test_model")
    # test validate (smoke test only — result is not asserted on)
    cm = learner.evaluate(val)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc)
    train_df = pd.read_csv("tabular_data/adults.csv")
    age = p.predict(train_df)[0][0]
    self.assertLess(age, 100)
    p.save("/tmp/test_predictor")
    p = ktrain.load_predictor("/tmp/test_predictor")
    # reloaded predictor must reproduce the same regression output
    self.assertAlmostEqual(p.predict(train_df)[0][0], age)
def model_pred(self, message):
    """Classify *message* with the saved DistilBERT predictor.

    Returns [confidence_percent, [predicted_label, 'ticket_gen']].
    The predictor is loaded from disk on every call.
    """
    print("loading model")
    predictor = ktrain.load_predictor('model/distilbert_model_40Epochs')
    print("predicting..")
    label = predictor.predict(message)
    results = [label, 'ticket_gen']
    probabilities = predictor.predict_proba(message)
    print("prediction done")
    # confidence = highest class probability, rounded then scaled to percent
    top_probability = np.max(probabilities)
    confidence = round(top_probability, 6) * 100
    print('Confidence Score : ' + str(confidence) + "%")
    return [confidence, results]
def test_linreg(self):
    """Linear text regression ('linreg') with up-to-trigram features."""
    trn, val, preproc = txt.texts_from_array(
        x_train=self.trn[0], y_train=self.trn[1],
        x_test=self.val[0], y_test=self.val[1],
        preprocess_mode='standard',
        ngram_range=3, maxlen=200, max_features=35000)
    model = txt.text_regression_model('linreg', train_data=trn,
                                      preproc=preproc)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val,
                                 batch_size=256)
    lr = 0.01
    hist = learner.fit_onecycle(lr, 10)
    # test training results
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertLess(min(hist.history['val_mae']), 12)
    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val[0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)
    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc)
    self.assertGreater(p.predict([TEST_DOC])[0], 100)
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor')
    self.assertGreater(p.predict([TEST_DOC])[0], 100)
    # explain() is not supported for regression models
    self.assertIsNone(p.explain(TEST_DOC))
def test_distilbert(self):
    """DistilBERT text regression: train one epoch and exercise the API."""
    trn, val, preproc = txt.texts_from_array(
        x_train=self.trn[0], y_train=self.trn[1],
        x_test=self.val[0], y_test=self.val[1],
        preprocess_mode='distilbert', maxlen=75)
    model = txt.text_regression_model('distilbert', train_data=trn,
                                      preproc=preproc)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val,
                                 batch_size=100)
    lr = 5e-5
    hist = learner.fit_onecycle(lr, 1)
    # test training results
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertLess(min(hist.history['val_mae']), 16)
    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val.x))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)
    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # test load and save model (transformer models need preproc to reload)
    tmp_folder = ktrain.imports.tempfile.mkdtemp()
    learner.save_model(tmp_folder)
    learner.load_model(tmp_folder, preproc=preproc)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc, batch_size=64)
    self.assertGreater(p.predict([TEST_DOC])[0], 1)
    tmp_folder = ktrain.imports.tempfile.mkdtemp()
    p.save(tmp_folder)
    p = ktrain.load_predictor(tmp_folder, batch_size=64)
    self.assertGreater(p.predict([TEST_DOC])[0], 1)
    # explain() is not supported for regression models
    self.assertIsNone(p.explain(TEST_DOC))
def test_ner(self):
    """Sequence tagging with a 'bilstm-bert' model: train, validate,
    save/load, and check token- and entity-level predictions."""
    model = txt.sequence_tagger('bilstm-bert', self.preproc,
                                bert_model='bert-base-cased')
    learner = ktrain.get_learner(model, train_data=self.trn,
                                 val_data=self.val, batch_size=128)
    lr = 0.01
    hist = learner.fit(lr, 1)
    # test training results
    # self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(learner.validate(), 0.79)
    # test top losses
    obs = learner.top_losses(n=1)
    self.assertIn(obs[0][0], list(range(len(self.val.x))))
    learner.view_top_losses(n=1)
    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')
    # test predictor: 'Smith' (second-to-last token) should be tagged I-PER
    SENT = 'There is a man named John Smith.'
    p = ktrain.get_predictor(learner.model, self.preproc)
    self.assertEqual(p.predict(SENT)[-2][1], 'I-PER')
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor')
    self.assertEqual(p.predict(SENT)[-2][1], 'I-PER')
    # merge_tokens joins multi-token entities; return_offsets adds char spans
    merged_prediction = p.predict(SENT, merge_tokens=True,
                                  return_offsets=True)
    self.assertEqual(merged_prediction[0][0], 'John Smith')
    self.assertEqual(merged_prediction[0][1], 'PER')
    self.assertEqual(merged_prediction[0][2], (21, 31))
def load(path):
    """Restore a DocumentClassifier previously saved to *path*.

    Reads params.json to decide the model type: BERT classifiers are
    reloaded via ktrain, anything else via a pickled sklearn pipeline.

    Raises:
        NotADirectoryError: if *path* is not an existing directory.
    """
    # `assert` is stripped when Python runs with -O, so validate explicitly.
    if not os.path.isdir(path):
        raise NotADirectoryError("Path must be a directory to load: %r" % path)
    params_path = os.path.join(path, 'params.json')
    with open(params_path, 'r') as f:
        params = json.load(f)
    clf = DocumentClassifier(params)
    if params['clf'] == 'BERT':
        import ktrain
        predictor_path = os.path.join(path, 'predictor')
        clf.predictor = ktrain.load_predictor(predictor_path)
    else:
        pipeline_path = os.path.join(path, 'pipeline.pickle')
        # NOTE(review): pickle.load executes arbitrary code from the file —
        # only load model directories from trusted sources.
        with open(pipeline_path, 'rb') as f:
            clf.pipeline = pickle.load(f)
    clf.fitted = True
    return clf
def test_ner(self):
    """Sequence tagging with 'bilstm-crf' and pretrained fastText vectors."""
    wv_url = (
        "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz"
    )
    model = txt.sequence_tagger("bilstm-crf", self.preproc,
                                wv_path_or_url=wv_url)
    learner = ktrain.get_learner(model, train_data=self.trn,
                                 val_data=self.val, batch_size=128)
    lr = 0.01
    hist = learner.fit(lr, 1)
    # test training results
    # self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(learner.validate(), 0.65)
    # test top losses
    obs = learner.top_losses(n=1)
    self.assertIn(obs[0][0], list(range(len(self.val.x))))
    learner.view_top_losses(n=1)
    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # test load and save model
    learner.save_model("/tmp/test_model")
    learner.load_model("/tmp/test_model")
    # test predictor: 'Smith' (second-to-last token) should be tagged I-PER
    SENT = "There is a man named John Smith."
    p = ktrain.get_predictor(learner.model, self.preproc)
    self.assertEqual(p.predict(SENT)[-2][1], "I-PER")
    p.save("/tmp/test_predictor")
    p = ktrain.load_predictor("/tmp/test_predictor")
    self.assertEqual(p.predict(SENT)[-2][1], "I-PER")
# Import Required Modules import requests from bs4 import BeautifulSoup as bs from googlesearch import search from ktrain import load_predictor is_predictor = 1 print("Loading BERT Model...") # Load BERT Model try: predictor = load_predictor('model/bert_model') print("Model Loded Successfully") except: is_precictor = 0 print("Model not found") # [ Function to get Code snippets from StackOverFlow ] def get_stackoverflow_codes(link): res = requests.get(link) # get HTML template soup = bs(res.text, "html.parser") alla = soup.select(".answer") # Function to get codes def get_answers(ans): fin_ans = [] for i in range(len(ans)): user = "" code_section = ans[i].select(".js-post-body")[0] pres = code_section.select("pre") codes = []
from shapely.geometry import Polygon, MultiPolygon, Point import numpy as np import random from geopy.distance import geodesic import plotly.io as pio from urllib.request import urlopen import json import os os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = "0" model_path = os.getenv( "MODEL_FILEPATH", os.path.join(os.path.dirname(os.path.dirname(__file__)), "model/v2")) import ktrain predictor = ktrain.load_predictor(model_path) pio.renderers.default = "browser" import re import dataset import psycopg2 import os GENERATE_PLOTS = False TABLE = 'articles_v2' # # db_config = { "user": "******", "password": "******", "host": "127.0.0.1", "port": "5432", "database": "cvwire",
# due to hardware limitations
import ktrain
from ktrain import text
import glob

# Fine-tune BERT on the labeled IMDB data, then use the resulting
# predictor to pseudo-label the unsupervised reviews.
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
    'aclImdb',
    maxlen=500,
    preprocess_mode='bert',
    train_test_names=['train', 'test'],
    classes=['pos', 'neg'])
model = text.text_classifier('bert', (x_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model,
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test),
                             batch_size=6)
learner.fit_onecycle(2e-5, 2)  # train for 2 epochs
predictor = ktrain.get_predictor(model, preproc)
predictor.save('/models/predictor')
predictor = ktrain.load_predictor('/models/predictor')

dataset = 'aclImdb/train/unsup'
file_list = glob.glob(dataset + "/*.txt")
# Use context managers so every file handle is closed even on error;
# the original opened one handle per review and never closed any of them.
with open("train_labels.txt", "w") as results:
    for file in file_list:
        with open(file, "r", encoding="utf-8") as fh:
            # each review file holds a single line of text
            review_text = fh.readline()
        predict = predictor.predict(review_text)
        results.write(predict + '\n')
def __init__(self):
    """Load the saved XLNet predictor and build a Transformer preprocessor.

    NOTE: MODEL_NAME is defined elsewhere in this module — presumably the
    checkpoint matching the saved predictor; confirm they agree.
    """
    predictor_dir = 'gsa_server/resources/xlnet_6epoch_3e-5'
    self.predictor = ktrain.load_predictor(predictor_dir)
    self.t = text.Transformer(MODEL_NAME, maxlen=500, class_names=[0, 1])
""" predictor = ktrain.get_predictor(learner.model, preproc=t) #predictor.predict('F**k you.') # predicted probability scores for each category #predictor.predict_proba('Jesus Christ is the central figure of Christianity.') #predictor.get_classes() """As expected, `soc.religion.christian` is assigned the highest probability. Let's invoke the `explain` method to see which words contribute most to the classification. We will need a forked version of the **eli5** library that supportes TensorFlow Keras, so let's install it first. """ #!pip3 install -q git+https://github.com/amaiya/eli5@tfkeras_0_10_1 #predictor.explain('Jesus Christ is the central figure in Christianity.') """The words in the darkest shade of green contribute most to the classification and agree with what you would expect for this example. We can save and reload our predictor for later deployment. """ predictor.save('./distilbert_predictor') reloaded_predictor = ktrain.load_predictor('./distilbert_predictor') print(reloaded_predictor.predict('My computer monitor is really blurry.'))
def loader_distilbert():
    """Load the persisted DistilBERT category predictor from disk."""
    model_dir = './models/distillbert/category_distilbert_predictor'
    return ktrain.load_predictor(model_dir)
'what a beautiful movie. great plot. acting was good. will see it again'] predictor.predict(data) #return_proba = True means it will give the prediction probabilty for each class predictor.predict(data, return_proba=True) #classes available predictor.get_classes() # !zip -r /content/bert.zip /content/bert ## Deploy Model # #loading the model predictor_load = ktrain.load_predictor('/content/drive/My Drive/ColabData/bert') # #predicting the data # predictor_load.predict(data) ## References - [`ktrain` module](https://github.com/amaiya/ktrain) - [Sentiment Classification Using Bert](https://kgptalkie.com/sentiment-classification-using-bert/) - [當Bert遇上Keras:這可能是Bert最簡單的打開姿勢](http://www.ipshop.xyz/15376.html) - [進擊的 BERT:NLP 界的巨人之力與遷移學習](https://leemeng.tw/attack_on_bert_transfer_learning_in_nlp.html)
from django.shortcuts import render, redirect from django.contrib import messages from django.http import JsonResponse import praw import ktrain import os THIS_FOLDER = os.path.dirname(os.path.abspath(__file__)) my_file = os.path.join(THIS_FOLDER, 'distilbert_predictor_final') predictor = ktrain.load_predictor(my_file) # Create your views here. def get_submission_from_url(submission_url): reddit = praw.Reddit(client_id='2uReEcmijpNWnw', client_secret='V0PCW7O1S6r3prN6ieRr4LVPGKo', user_agent='test reddit app') submission = None if submission_url.startswith('www'): submission_url = "https://" + submission_url elif submission_url.startswith('reddit'): submission_url = "https://www." + submission_url if submission_url.startswith('https://'): submission = reddit.submission(url=submission_url) return submission def get_data_from_post(submission_url): submission = get_submission_from_url(submission_url) if submission is not None: full_text = submission.title + submission.selftext
from flask import Flask,render_template,url_for,request import pandas as pd import pickle from sklearn.feature_extraction.text import CountVectorizer from sklearn.naive_bayes import MultinomialNB import joblib import ktrain # load the model from disk filename = 'bert' #clf = pickle.load(open(filename, 'rb')) #cv=pickle.load(open('tranform.pkl','rb')) model = ktrain.load_predictor(filename) app = Flask(__name__) @app.route('/') def home(): return render_template('home.html') @app.route('/predict',methods=['POST']) def predict(): # df= pd.read_csv("spam.csv", encoding="latin-1") # df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True) # # Features and Labels # df['label'] = df['class'].map({'ham': 0, 'spam': 1}) # X = df['message'] # y = df['label'] # # # Extract Feature With CountVectorizer # cv = CountVectorizer() # X = cv.fit_transform(X) # Fit the Data
import os

# Hide all GPUs *before* TensorFlow is initialized: CUDA_VISIBLE_DEVICES is
# read when the CUDA runtime starts, so setting it after `import ktrain`
# (which imports TensorFlow) may have no effect.
os.environ["CUDA_VISIBLE_DEVICES"] = '-1'

import ktrain

# Load the saved DistilBERT predictor once at import time.
predictor = ktrain.load_predictor('distilbert')


def get_prediction(x):
    """Return the predicted class label for a single text string *x*."""
    sent = predictor.predict([x])
    return sent[0]
# Shuffle dataset to get a good variation over the labels by first zipping # the list together, then shuffle and unzip again shuffle_zip = list(zip(text, labels)) np.random.shuffle(shuffle_zip) text_shuffled, labels_shuffled = zip(*shuffle_zip) text_shuffled = list(text_shuffled) labels_shuffled = list(labels_shuffled) # Split the data in 80% for the train and validation set and 20% for the test set test_split = int(0.8 * len(text_shuffled)) test_text = text_shuffled[test_split:] test_labels = labels_shuffled[test_split:] # Load the BERT classifer from memory tf.autograph.set_verbosity(0) classifier = ktrain.load_predictor( os.path.join(model_path, model_name, "Final", "BERT_model")) # Make predictions on the test set test_predicted_prob = classifier.predict_proba(test_text)[:, 1] tf.keras.backend.clear_session( ) # Clear session to prevent memory leak from TF # Determine the final predicted class labels for this model test_predicted = np.where(test_predicted_prob < np.float64(threshold), 0, 1) # Check if results folder exists for this case if not os.path.exists(result_path): os.makedirs(result_path) # Plot the ROC curve fig, ax = plt.subplots()
def load_model_sentiment(model_path):
    """Deserialize and return the ktrain predictor stored at *model_path*."""
    predictor = ktrain.load_predictor(model_path)
    return predictor
import ktrain

# Load the saved DistilBERT predictor once at import time.
predictor = ktrain.load_predictor("distilbert")


def get_prediction(x):
    """Return the predicted class label for a single text string *x*."""
    return predictor.predict([x])[0]
def test_folder(self):
    """Image classification from a folder layout with pretrained ResNet50:
    train the frozen head, then exercise the full learner/predictor API."""
    (trn, val, preproc) = vis.images_from_folder(
        datadir='image_data/image_folder',
        data_aug=vis.get_data_aug(horizontal_flip=True),
        classes=['cat', 'dog'],
        train_test_names=['train', 'valid'])
    model = vis.image_classifier('pretrained_resnet50', trn, val)
    learner = ktrain.get_learner(model=model, train_data=trn, val_data=val,
                                 batch_size=1)
    learner.freeze()
    hist = learner.autofit(1e-3, monitor='val_acc')
    # test train
    self.assertAlmostEqual(max(hist.history['lr']), 1e-3)
    # with two balanced classes, 0.5 accuracy means nothing was learned
    if max(hist.history['acc']) == 0.5:
        raise Exception('unlucky initialization: please run test again')
    self.assertGreater(max(hist.history['acc']), 0.8)
    # test top_losses
    obs = learner.top_losses(n=1, val_data=val)
    print(obs)
    if obs:
        self.assertIn(obs[0][0], list(range(U.nsamples_from_data(val))))
    else:
        # top_losses can be empty only when validation accuracy is perfect
        self.assertEqual(max(hist.history['val_acc']), 1)
    # test weight decay (per-layer API: one entry per eligible layer)
    self.assertEqual(len(learner.get_weight_decay()), 54)
    self.assertEqual(learner.get_weight_decay()[0], None)
    learner.set_weight_decay(1e-4)
    self.assertAlmostEqual(learner.get_weight_decay()[0], 1e-4)
    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')
    # test validate: confusion matrix should be diagonal-dominant
    cm = learner.validate(val_data=val)
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)
    # test predictor (folder-, proba-, and filename-based prediction)
    p = ktrain.get_predictor(learner.model, preproc)
    r = p.predict_folder('image_data/image_folder/train/')
    print(r)
    self.assertEqual(r[0][1], 'cat')
    r = p.predict_proba_folder('image_data/image_folder/train/')
    self.assertEqual(np.argmax(r[0][1]), 0)
    r = p.predict_filename(
        'image_data/image_folder/train/cat/cat.11737.jpg')
    self.assertEqual(r, ['cat'])
    r = p.predict_proba_filename(
        'image_data/image_folder/train/cat/cat.11737.jpg')
    self.assertEqual(np.argmax(r), 0)
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor')
    r = p.predict_filename(
        'image_data/image_folder/train/cat/cat.11737.jpg')
    self.assertEqual(r, ['cat'])
import pandas as pd
import ktrain
import sys

# CLI usage: script.py <model_path> <csv_with_Reviews_column>
path_model = sys.argv[1]
path_predict = sys.argv[2]

# Load the reviews and the persisted predictor, then print predictions.
X_predict = pd.read_csv(path_predict)
predictor = ktrain.load_predictor(path_model)
print(predictor.predict(list(X_predict['Reviews'])))