def build(save=True):
    # Load the prepared dataset and pull out features/targets
    dataset = data.prepare()
    descriptions = dataset['CleanDescription'].tolist()
    salaries = dataset['SalaryNormalized'].tolist()
    # Vectorize job descriptions as TF-IDF over 1-3 grams
    vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 3),
                                 max_features=24000000)
    features = vectorizer.fit_transform(descriptions)
    Xtrain, Xval, ytrain, yval = train_test_split(features, salaries,
                                                  test_size=0.3)
    rr = linear_model.Ridge(alpha=0.035)
    rr.fit(Xtrain, ytrain)
    r2 = r2_score(yval, rr.predict(Xval))
    print(f'Ridge Regression R2 = {r2}')
    if save:
        app_dir = Path(__file__).resolve().parents[1]
        joblib.dump([rr, vectorizer], join(app_dir, 'naive_model.joblib'))
    return rr, vectorizer
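
# Hedged usage sketch, not part of the original module: assuming build()
# above was run with save=True and the joblib file is reachable from the
# current directory, the artifacts can be reloaded for ad-hoc scoring.
# The sample description is made up.
from joblib import load

rr, vectorizer = load('naive_model.joblib')
X = vectorizer.transform(['Senior Python developer, London, finance'])
print('Predicted normalized salary:', rr.predict(X)[0])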
def predict():
    global global_graph
    # Based on
    # https://blog.keras.io/building-a-simple-keras-deep-learning-rest-api.html
    # TODO: Handle errors gracefully
    text = flask.request.args.get('text', type=str)
    if text is None:
        error = "Missing required parameter 'text'"
        success = False
    else:
        try:
            features = data.prepare(text, word_vectors, num_words)
        except data.InvalidTextError as e:
            error = str(e)
            success = False
        else:
            # Turn into a batch of one for prediction
            with global_graph.as_default():
                prediction = model.predict(features[np.newaxis, :],
                                           batch_size=1)
            sentiment = ('negative', 'positive')[int(prediction.round())]
            positivity_score = float(prediction)
            words = word_vectors.words[features[features != 0]].tolist()
            success = True
    allowed = set(
        'text error success sentiment positivity_score words'.split())
    return flask.jsonify({k: v for k, v in locals().items() if k in allowed})
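
# Hedged usage sketch (assumptions: the view above is registered as
# /predict on a Flask app, e.g. app.add_url_rule('/predict',
# view_func=predict), and a server is listening on localhost:5000).
import requests

resp = requests.get('http://localhost:5000/predict',
                    params={'text': 'What a wonderful movie!'})
print(resp.json())  # e.g. {'success': True, 'sentiment': 'positive', ...}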
def test_prepare(word_vectors):
    with pytest.raises(data.InvalidTextError):
        data.prepare('', word_vectors, num_words=3)
    # Get the indices of some common words
    the = word_vectors.indices['the']
    of = word_vectors.indices['of']
    pad = word_vectors.indices[None]
    # Zeroes are used for padding
    assert pad == 0
    # Are we pre-padding correctly?
    indices = data.prepare('the of', word_vectors, num_words=3)
    assert all(indices == (pad, the, of))
    # pre-truncating?
    indices = data.prepare('the the of of', word_vectors, num_words=3)
    assert all(indices == (the, of, of))
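
# Hedged sketch of an implementation that would satisfy the test above;
# this is an assumption, not the project's actual data.prepare. It pre-pads
# with the reserved padding index 0 and pre-truncates to the last num_words
# tokens, matching the asserted behaviour.
import numpy as np

def prepare_sketch(text, word_vectors, num_words):
    tokens = text.split()
    if not tokens:
        raise data.InvalidTextError('text contains no known words')
    indices = [word_vectors.indices[t] for t in tokens][-num_words:]
    return np.array([0] * (num_words - len(indices)) + indices)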
def prepare_data_and_watchlists(context, bar_data):
    sn = context.sn
    log.info("prepare, sn {}".format(sn))
    data = context.data
    wls = context.watchlists
    symbols = context.symbols
    wls.wl_set_sn(sn)
    data.prepare(sn, bar_data)
    # ######## prepare factors ##################
    if sn > 5:  # if using history data, leave some blank at the beginning
        log.info("prepare, sma5, start")
        W0 = wls.wl_get_until("W0", 0)
        sma5 = data.history(W0, 'close', bar_count=5, frequency="1m").mean()
        data.get_factor("sma5").set(W0, sma5)
        log.info("prepare, sma5, done")
def predict(params):
    """
    From a set of parameters, loads a network (model and weights) and
    builds a prediction vector, which is returned together with the
    number of tendency errors found.
    """
    raw = data.read(params, params['pred_dataset'])
    normalized = data.normalize(raw, params)
    adjusted = parameters.adjust(normalized, params)
    # prepare test data
    _, _, X_test, Y_test = data.prepare(adjusted, params)
    # Perform the prediction.
    model1 = model.prediction_setup(params)
    print('Feeding X_test (shape=', X_test.shape, ')')
    (yhat, rmse, num_errors) = range_predict(model1, X_test, Y_test, params)
    return (params, model1, Y_test, yhat, rmse, num_errors)
def find_similar_faq(text, dataset, dataset_raw, sim_func, top=5):
    query = prepare(text)
    sims = []
    for q in dataset:
        s = sim_func(q, query)
        s = np.mean(s)
        sims.append(s)
    # Indices of the `top` most similar questions, best first
    indices = np.array(sims).argsort()[::-1][:top]
    res = dataset_raw.loc[indices][['title', 'text']]
    result = []
    for _, row in res.iterrows():
        result.append({
            'title': row['title'],
            'text': row['text']
        })
    return result
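
# Hedged usage sketch (the similarity function and query are illustrative
# assumptions; dataset and dataset_raw come from the caller's corpus).
# A simple sim_func could score token overlap between prepared texts:
def jaccard_sim(q, query):
    q, query = set(q), set(query)
    return len(q & query) / max(len(q | query), 1)

matches = find_similar_faq('How do I reset my password?',
                           dataset, dataset_raw, jaccard_sim, top=3)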
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Initialization of seeds
set_random_seed(2)
seed(2)

# Load the dataset and prepare train/test splits
params = parameters.read()
raw = data.read(params)
print('Original dataset num samples:', raw.shape)
adjusted = parameters.adjust(raw, params)
X_train, Y_train, X_test, Y_test = data.prepare(adjusted, params)

# Build the model and load pre-trained weights
params['lstm_batch_size'] = 1
model = lstm.build(params)
model.load_weights("20180116_0438.h5")
print("Loaded weights from disk")

print('Actual:', params['y_scaler'].inverse_transform(Y_test[31]))
# Predict Y_hat for one test sample, then invert the scaling
Y_hat = model.predict(X_test[31].reshape((1, 6, 8)),
                      batch_size=params['lstm_batch_size'])
print('Prediction:', params['y_scaler'].inverse_transform(Y_hat))
params['lstm_timesteps'] = hpv3
for hpv4 in hyperparams['lstm_layer1']:
    params['lstm_layer1'] = hpv4
    for hpv5 in hyperparams['lstm_dropout1']:
        params['lstm_dropout1'] = hpv5
        for hpv6 in hyperparams['lstm_stateful']:
            params['lstm_stateful'] = hpv6
            for hpv7 in hyperparams['lstm_shuffle']:
                params['lstm_shuffle'] = hpv7
                for hpv8 in hyperparams['lstm_forget_bias']:
                    params['lstm_forget_bias'] = hpv8
                    #
                    # s e t u p
                    #
                    adjusted = parameters.adjust(raw, params)
                    X, Y, Xtest, ytest = prepare(
                        normalize(adjusted, params), params)
                    #
                    # t r a i n i n g
                    #
                    model = setup(params)
                    parameters.summary(params)
                    model.summary()
                    lstm.stateless_fit(model, X, Y, Xtest, ytest, params)
                    #
                    # r e b u i l d  &  p r e d i c t
                    #
                    pred = lstm.build(params, batch_size=1)
                    pred.set_weights(model.get_weights())
                    (yhat, rmse, num_errors) = lstm.range_predict(
                        pred, Xtest, ytest, params)
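
# Hedged sketch (keys come from the loops above; the values are
# illustrative assumptions): the grid search expects a hyperparams dict
# mapping each tuned key to the list of values to sweep, e.g.:
hyperparams = {
    'lstm_timesteps': [4, 6, 8],
    'lstm_layer1': [32, 64],
    'lstm_dropout1': [0.1, 0.2],
    'lstm_stateful': [False],
    'lstm_shuffle': [True, False],
    'lstm_forget_bias': [True],
}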
import data as dt
import suggest as sg

# Test 1
print("1. Loading data: ")
a = dt.load()
if a is None:
    print("fail")

# Test 2
print("2. Preparing data: ")
b = dt.prepare(a)
if b is None:
    print("fail")
else:
    assert b['Julia']['Ilhabela/SP'] == 1.0, "fail, value not equal"

# Test 3
assert sg.psimilar(9) == 0.1, "fail, percentage"
assert sg.euclidian([4, 8], [1, 4]) == 5, "fail, euclidean distance"

# Test 4
assert round(sg.similar(b, 'Evelyn', 'Antonio'), 2) == 6.93, \
    "fail, distance between users"

# Test 5
t5 = sg.getsimilaruser(b, 'Evelyn')
assert round(t5['Antonio'], 4) == 0.1261, "fail, similar users"
import plot
%load_ext autoreload
%autoreload 2

set_random_seed(2)
seed(2)

#
# s e t u p
#
raw, params = parameters.initialize()
normalized = data.normalize(raw, params)
parameters.summary(params)
X, Y, Xtest, ytest = data.prepare(normalized, params)

#
# t r a i n i n g
#
model = model.setup(params)  # note: rebinds `model` from module to network
model.summary()
lstm.stateless_fit(model, X, Y, Xtest, ytest, params)
# model.save(model, params, prefix='5y', additional_epocs=0)

#
# r e b u i l d  &  p r e d i c t
#
pred = lstm.build(params, batch_size=1)
pred.set_weights(model.get_weights())
(yhat, rmse, num_errors) = lstm.range_predict(pred, Xtest, ytest, params)
# Import libraries
import math

import keras
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense, Flatten, Conv3D, MaxPooling3D
from keras.layers import Dropout, BatchNormalization
from keras.losses import categorical_crossentropy
from data import prepare

# Load pre-processed data
covid_scans = prepare("covid")
non_covid_scans = prepare("non_covid")

# Assign labels to differentiate between covid (1) and non-covid (0) scans
covid_labels = np.array([1 for _ in range(len(covid_scans))])
non_covid_labels = np.array([0 for _ in range(len(non_covid_scans))])

# Split each class 80% train / 20% validation
x_scale = math.floor(len(covid_labels) * .8)
y_scale = math.floor(len(non_covid_labels) * .8)

# Join covid and non-covid data into train and validation sets
x_train = np.concatenate((covid_scans[:x_scale], non_covid_scans[:y_scale]))
y_train = np.concatenate((covid_labels[:x_scale], non_covid_labels[:y_scale]))
x_val = np.concatenate((covid_scans[x_scale:], non_covid_scans[y_scale:]))
y_val = np.concatenate((covid_labels[x_scale:], non_covid_labels[y_scale:]))

# Visualizing one CT scan
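# Hedged sketch (assumes each scan is a 3-D array shaped
# (depth, height, width)): show the middle axial slice of the first
# training scan.
middle = x_train[0].shape[0] // 2
plt.imshow(x_train[0][middle], cmap='gray')
plt.title('Middle slice of one CT scan')
plt.axis('off')
plt.show()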