Example #1
from os.path import join
from pathlib import Path

import joblib
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

import data  # project-local preprocessing module


def build(save=True):
    dataset = data.prepare()
    descriptions = dataset['CleanDescription'].tolist()
    salaries = dataset['SalaryNormalized'].tolist()

    vectorizer = TfidfVectorizer(min_df=1,
                                 ngram_range=(1, 3),
                                 max_features=24000000)
    features = vectorizer.fit_transform(descriptions)

    Xtrain, Xval, ytrain, yval = train_test_split(features,
                                                  salaries,
                                                  test_size=0.3)

    rr = linear_model.Ridge(alpha=0.035)
    rr.fit(Xtrain, ytrain)

    r2 = r2_score(yval, rr.predict(Xval))
    print(f'Ridge Regression R2 = {r2}')

    if save:
        app_dir = Path(__file__).resolve().parents[1]
        joblib.dump([rr, vectorizer], join(app_dir, 'naive_model.joblib'))

    return rr, vectorizer
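
For reference, a minimal usage sketch (not part of the original project) showing how the artifact saved above could be loaded and applied to a new job description; the file name and directory layout follow the example, everything else is an assumption.

# Hypothetical usage: load the persisted Ridge model and TF-IDF vectorizer.
from pathlib import Path
import joblib

app_dir = Path(__file__).resolve().parents[1]
rr, vectorizer = joblib.load(app_dir / 'naive_model.joblib')

description = 'Senior data engineer, Python and SQL, remote'
features = vectorizer.transform([description])  # same vocabulary as training
print('Predicted normalized salary:', rr.predict(features)[0])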
Example #2
def predict():
    global global_graph
    # Based on
    # https://blog.keras.io/building-a-simple-keras-deep-learning-rest-api.html

    # TODO: Handle errors gracefully
    text = flask.request.args.get('text', type=str)

    if text is None:
        error = "Missing required parameter 'text'"
        success = False
    else:
        try:
            features = data.prepare(text, word_vectors, num_words)
        except data.InvalidTextError as e:
            error = str(e)
            success = False
        else:
            # Turn into a batch of one for prediction
            with global_graph.as_default():
                prediction = model.predict(features[np.newaxis, :],
                                           batch_size=1)
            sentiment = ('negative', 'positive')[int(prediction.round())]
            positivity_score = float(prediction)
            words = word_vectors.words[features[features != 0]].tolist()
            success = True

    allowed = set(
        'text error success sentiment positivity_score words'.split())
    return flask.jsonify({k: v for k, v in locals().items() if k in allowed})
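
A hypothetical client-side call for this endpoint is sketched below; the route '/predict' and the host/port are assumptions, since the Flask route decorator is not shown in the excerpt.

# Hypothetical client call; route and host are assumptions.
import requests

resp = requests.get('http://localhost:5000/predict',
                    params={'text': 'Great movie, loved it'})
payload = resp.json()
if payload['success']:
    print(payload['sentiment'], payload['positivity_score'])
else:
    print('Error:', payload['error'])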
Example #3
def test_prepare(word_vectors):
    with pytest.raises(data.InvalidTextError):
        data.prepare('', word_vectors, num_words=3)

    # Get the indices of some common words
    the = word_vectors.indices['the']
    of = word_vectors.indices['of']
    pad = word_vectors.indices[None]  # Zeroes are used for padding
    assert pad == 0

    # Are we pre-padding correctly?
    indices = data.prepare('the of', word_vectors, num_words=3)
    assert all(indices == (pad, the, of))

    # pre-truncating?
    indices = data.prepare('the the of of', word_vectors, num_words=3)
    assert all(indices == (the, of, of))
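
A minimal sketch of the pre-padding and pre-truncating behaviour this test expects; the names mirror the test, but the implementation itself is an assumption about data.prepare, not the project's actual code.

# Hypothetical data.prepare consistent with the test above.
import numpy as np


class InvalidTextError(ValueError):
    """Raised when the text contains no usable tokens."""


def prepare(text, word_vectors, num_words):
    tokens = text.split()
    indices = [word_vectors.indices[t] for t in tokens
               if t in word_vectors.indices]
    if not indices:
        raise InvalidTextError('text contains no known words')
    indices = indices[-num_words:]               # pre-truncate: keep the last words
    padded = np.zeros(num_words, dtype=int)      # index 0 is the padding index
    padded[num_words - len(indices):] = indices  # pre-pad with zeros on the left
    return padded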
Example #4
def prepare_data_and_watchlists(context, bar_data):
    sn = context.sn
    log.info("prepare, sn {}".format(sn))

    data = context.data
    wls = context.watchlists
    symbols = context.symbols

    wls.wl_set_sn(sn)

    data.prepare(sn, bar_data)

    ######### prepare factors ##################
    if sn > 5:  # if using history data, leave some blank at beginning
        log.info("prepare, sma5, start")
        W0 = wls.wl_get_until("W0", 0)
        sma5 = data.history(W0, 'close', bar_count=5, frequency="1m").mean()
        data.get_factor("sma5").set(W0, sma5)
        log.info("prepare, sma5, done")
Example #5
def predict(params):
    """
    From a set of parameters, loads a network (model and weights), builds a
    prediction vector, which is returned together with the number of tendency
    errors found
    """
    raw = data.read(params, params['pred_dataset'])
    normalized = data.normalize(raw, params)
    adjusted = parameters.adjust(normalized, params)
    # prepare test data
    _, _, X_test, Y_test = data.prepare(adjusted, params)
    # Perform the prediction.
    model1 = model.prediction_setup(params)
    print('Feeding X_test (shape=', X_test.shape, ')')
    (yhat, rmse, num_errors) = range_predict(model1, X_test, Y_test, params)
    return (params, model1, Y_test, yhat, rmse, num_errors)
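
A possible call site for this function, mirroring the call pattern in the other examples; the usage lines below are an assumption, not part of the original file.

# Hypothetical usage of predict() with parameters read from the project config.
params, model1, Y_test, yhat, rmse, num_errors = predict(parameters.read())
print('RMSE:', rmse, '- tendency errors:', num_errors)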
Example #6
def find_similar_faq(text, dataset, dataset_raw, sim_func, top=5):
    query = prepare(text)
    sims = []
    for q in dataset:
        s = sim_func(q, query)
        s = np.mean(s)
        sims.append(s)

    indices = np.array(sims).argsort()[::-1][:top]
    res = dataset_raw.loc[indices][['title', 'text']]
    result = []
    for _, row in res.iterrows():
        result.append({
            'title': row['title'],
            'text': row['text']
        })

    return result
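
The snippet leaves sim_func to the caller; one plausible choice (an assumption, not taken from the original project) is cosine similarity over word vectors, which the np.mean call above then averages into a single score per FAQ entry.

# Hypothetical sim_func: pairwise cosine similarities between the word vectors
# of a stored question and the prepared query.
import numpy as np


def cosine_sim_func(question_vectors, query_vectors):
    q = question_vectors / np.linalg.norm(question_vectors, axis=1, keepdims=True)
    r = query_vectors / np.linalg.norm(query_vectors, axis=1, keepdims=True)
    return q @ r.T  # find_similar_faq averages this matrix with np.mean

# faq = find_similar_faq('how do I reset my password', dataset, dataset_raw,
#                        sim_func=cosine_sim_func, top=3)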
Example #7

%matplotlib inline
%load_ext autoreload
%autoreload 2

# Initialization of seeds
set_random_seed(2)
seed(2)

# Read parameters, load the data, and build the model
params = parameters.read()
raw = data.read(params)
print('Original dataset num samples:', raw.shape)
adjusted = parameters.adjust(raw, params)
X_train, Y_train, X_test, Y_test = data.prepare(adjusted, params)

# Build the model and train it.
params['lstm_batch_size'] = 1
model = lstm.build(params)

# load weights into new model
model.load_weights("20180116_0438.h5")
print("Loaded weights from disk")

print('Actual:', params['y_scaler'].inverse_transform(Y_test[31]))

# Plot the test values for Y, and Y_hat, without scaling (inverted)
Y_hat = model.predict(X_test[31].reshape((1, 6, 8)),
                      batch_size=params['lstm_batch_size'])
print('Prediction:', params['y_scaler'].inverse_transform(Y_hat))
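
The comment above mentions plotting Y and Y_hat, but the excerpt only prints them; a minimal plotting sketch is given below, with the matplotlib import and the exact array shapes treated as assumptions.

# Hypothetical plot of actual vs. predicted values in the original scale.
import matplotlib.pyplot as plt

actual = params['y_scaler'].inverse_transform(Y_test[31])
predicted = params['y_scaler'].inverse_transform(Y_hat)
plt.plot(actual.ravel(), label='actual')
plt.plot(predicted.ravel(), label='predicted')
plt.legend()
plt.show()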
Example #8
 params['lstm_timesteps'] = hpv3
 for hpv4 in hyperparams['lstm_layer1']:
     params['lstm_layer1'] = hpv4
     for hpv5 in hyperparams['lstm_dropout1']:
         params['lstm_dropout1'] = hpv5
         for hpv6 in hyperparams['lstm_stateful']:
             params['lstm_stateful'] = hpv6
             for hpv7 in hyperparams['lstm_shuffle']:
                 params['lstm_shuffle'] = hpv7
                 for hpv8 in hyperparams['lstm_forget_bias']:
                     params['lstm_forget_bias'] = hpv8
                     #
                     # s e t u p
                     #
                     adjusted = parameters.adjust(raw, params)
                     X, Y, Xtest, ytest = prepare(
                         normalize(adjusted, params), params)
                     #
                     # t r a i n i n g
                     #
                     model = setup(params)
                     parameters.summary(params)
                     model.summary()
                     lstm.stateless_fit(model, X, Y, Xtest, ytest,
                                        params)
                     #
                     # r e b u i l d   &   p r e d i c t
                     #
                     pred = lstm.build(params, batch_size=1)
                     pred.set_weights(model.get_weights())
                     (yhat, rmse, num_errors) = lstm.range_predict(
                         pred, Xtest, ytest, params)
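
These nested loops run an exhaustive grid search over the hyperparams dictionary. The same sweep can be written more compactly with itertools.product, sketched below under the assumption that hyperparams maps each parameter name to a list of candidate values; this is an alternative formulation, not the original code.

# Hypothetical flattened version of the same hyperparameter grid search.
from itertools import product

names = ['lstm_timesteps', 'lstm_layer1', 'lstm_dropout1',
         'lstm_stateful', 'lstm_shuffle', 'lstm_forget_bias']
for values in product(*(hyperparams[n] for n in names)):
    params.update(zip(names, values))
    adjusted = parameters.adjust(raw, params)
    X, Y, Xtest, ytest = prepare(normalize(adjusted, params), params)
    model = setup(params)
    lstm.stateless_fit(model, X, Y, Xtest, ytest, params)
    pred = lstm.build(params, batch_size=1)
    pred.set_weights(model.get_weights())
    yhat, rmse, num_errors = lstm.range_predict(pred, Xtest, ytest, params)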
Example #9
import data as dt
import suggest as sg

# Test 1
print("1. Loading data: ")
a = dt.load()

if a is None:
    print("fail")

# Test 2
print("2. Preparing data: ")
b = dt.prepare(a)

if b is None:
    print("fail")
else:
    assert (b['Julia']['Ilhabela/SP'] == 1.0), "fail, value not equal"

# Test 3
assert sg.psimilar(9) == 0.1, "fail, percentage similarity"
assert sg.euclidian([4, 8], [1, 4]) == 5, "fail, euclidean distance"

# Test 4
assert round(sg.similar(b, 'Evelyn', 'Antonio'),
             2) == 6.93, "fail, distance between users"

# Test 5
t5 = sg.getsimilaruser(b, 'Evelyn')
assert round(t5['Antonio'], 4) == 0.1261, "fail, similar users"
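
For reference, implementations consistent with the values asserted in Test 3 would look roughly like the sketch below; the actual suggest module may differ, so treat this as an assumption.

# Hypothetical implementations matching the Test 3 assertions:
#   euclidian([4, 8], [1, 4]) -> sqrt(3**2 + 4**2) = 5
#   psimilar(9)               -> 1 / (1 + 9)       = 0.1
import math


def euclidian(a, b):
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))


def psimilar(distance):
    return 1 / (1 + distance)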
Example #10
import plot


%load_ext autoreload
%autoreload 2

set_random_seed(2)
seed(2)

#
# s e t u p
#
raw, params = parameters.initialize()
normalized = data.normalize(raw, params)
parameters.summary(params)
X, Y, Xtest, ytest = data.prepare(normalized, params)

#
# t r a i n i n g
#
model = model.setup(params)
model.summary()
lstm.stateless_fit(model, X, Y, Xtest, ytest, params)
# model.save(model, params, prefix='5y', additional_epocs=0)

#
# r e b u i l d   &   p r e d i c t
#
pred = lstm.build(params, batch_size=1)
pred.set_weights(model.get_weights())
(yhat, rmse, num_errors) = lstm.range_predict(pred, Xtest, ytest, params)
Example #11
# Import libraries
import keras
import numpy as np
import math
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense, Flatten, Conv3D, MaxPooling3D
from keras.layers import Dropout, BatchNormalization
from keras.losses import categorical_crossentropy
from data import prepare
import matplotlib.pyplot as plt

# Load the pre-processed data
covid_scans = prepare("covid")
non_covid_scans = prepare("non_covid")

# Assign labels to differentiate between covid(1) and non-covid(0) scans
covid_labels = np.array([1 for _ in range(len(covid_scans))])
non_covid_labels = np.array([0 for _ in range(len(non_covid_scans))])

# Compute 80/20 train/validation split indices for each class
x_scale = math.floor(len(covid_labels) * .8)
y_scale = math.floor(len(non_covid_labels) * .8)

# Join covid and non-covid scans into training and validation sets
x_train = np.concatenate((covid_scans[:x_scale], non_covid_scans[:y_scale]))
y_train = np.concatenate((covid_labels[:x_scale], non_covid_labels[:y_scale]))
x_val = np.concatenate((covid_scans[x_scale:], non_covid_scans[y_scale:]))
y_val = np.concatenate((covid_labels[x_scale:], non_covid_labels[y_scale:]))

# visualizing one CT scan
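# A minimal sketch (not from the original script) of displaying one scan,
# assuming each scan is a 3-D array with slices along its last axis.
scan = x_train[0]
plt.imshow(scan[:, :, scan.shape[-1] // 2], cmap='gray')
plt.title('Middle slice of one CT scan')
plt.axis('off')
plt.show()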