Code example #1
def build_lstm(sequence_length, n_words, starting_output_dim, batch_size=100):
    mem_size_GB = ((starting_output_dim * sequence_length * batch_size * 8) /
                   1000000000)
    logger.log(f'Current network will take up {mem_size_GB} GB of memory.')
    if mem_size_GB > config.MAX_MEMORY_GB:
        raise Exception(
            'Network is configured to use more than available RAM. '
            f'Total size of current network: {mem_size_GB} GB')

    classifier = NeuralNetwork('SEQUENTIAL', epochs=40, batch_size=batch_size)
    classifier.add(
        Embedding(input_dim=n_words,
                  output_dim=starting_output_dim,
                  input_length=sequence_length,
                  trainable=True))  # Embed the text sequences
    classifier.add(
        LSTM(units=int(starting_output_dim / 2), return_sequences=True))
    classifier.add(
        LSTM(units=int(starting_output_dim / 4), return_sequences=False))
    classifier.add(Dense(units=1, activation='sigmoid'))

    # Other optimizers: rmsprop, adagrad, adam, adadelta, adamax, nadam, SGD(lr=0.01)
    classifier.compile(optimizer=SGD(learning_rate=0.0001, momentum=0.0),
                       loss='binary_crossentropy',
                       metrics=['accuracy'])

    return classifier
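
A quick worked check of the mem_size_GB formula above, using illustrative values (the 3000/100 pair mirrors the build_lstm call in code example #10; the sequence length of 500 is an assumption based on the padding comment there):

# Illustrative only: one batch of embedding activations at 8 bytes per value.
starting_output_dim = 3000  # embedding width from code example #10
sequence_length = 500       # assumed padded length (config.MAX_SEQUENCE_LENGTH)
batch_size = 100

mem_size_GB = (starting_output_dim * sequence_length * batch_size * 8) / 1000000000
print(mem_size_GB)  # 1.2 -> this configuration trips any MAX_MEMORY_GB limit below 1.2
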
Code example #2
File: junction.py Project: punker76/angular-bokeh
 def do_it(self, a):
     try:
         inp = json.loads(a)
         is_json = True
     except (ValueError, TypeError):  # not valid JSON
         is_json = False
     logger.log(lvl="INFO", msg="from angular: %s" % a[0:100], orig=self.cn)
     if is_json:
         fct = inp["name"]
         args = inp["args"]
         user = inp["user"]
         if self.sleep and fct not in modalFct:
             return
         try:
             method = getattr(self, fct)
             if inspect.iscoroutinefunction(method):
                 aio.ensure_future(method(*args, user=user))
             else:
                 return method(*args, user=user)
         except Exception:
             exc_type, exc_value, exc_traceback = sys.exc_info()
             formatted_lines = traceback.format_exc().splitlines()
             return (
                 "ERROR no valid json-function call from junction via .. %s \n %s \n %s \n %s"
                 % (fct, exc_type, exc_value, formatted_lines))
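
For reference, do_it expects the incoming string to be a JSON object with "name", "args" and "user" keys; a minimal sketch of such a message (the method name and arguments here are hypothetical):

import json

# Hypothetical payload: "name" must match a method on the junction object,
# "args" are passed positionally, and "user" is forwarded as a keyword argument.
message = json.dumps({
    "name": "update_plot",  # hypothetical method
    "args": [640, 480],
    "user": "alice",
})
# junction.do_it(message) would then call junction.update_plot(640, 480, user="alice")
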
Code example #3
 def save(self, file_path):
     try:
         self.model.save(file_path + '.h5')
         logger.log('Model saved.')
         return True
     except Exception as e:
         logger.log_error(e)
         return False
Code example #4
 def load(self, file_path):
     try:
         self.model = load_model(file_path + '.h5')
         logger.log('Model loaded.')
         self.model.summary()
         return True
     except Exception as e:
         logger.log_error(e)
         return False
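
A minimal round-trip sketch for the save and load methods above; both return booleans, so callers can branch without their own try/except (model_wrapper stands in for whatever object carries these methods):

# Hypothetical round trip: save() appends '.h5', so pass the path without an extension.
if model_wrapper.save('/tmp/sentiment_model'):  # writes /tmp/sentiment_model.h5
    model_wrapper.load('/tmp/sentiment_model')  # restores the model and prints its summary
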
Code example #5
def train(dataset):
    global is_training
    global vectorizer
    global classifier
    global classifier_ready

    if is_training:
        raise TrainingInProgressError()

    logger.log('Starting training...')
    is_training = True
    classifier_ready = False

    text, values = zip(*dataset.data)

    dataset_size = len(text)
    # train_size may be a fraction (0.9) or an absolute row count; sklearn accepts both
    train_size = 0.9 if dataset_size <= config.MAX_DATASET_SIZE else config.MAX_DATASET_SIZE

    X_train, X_discard, y_train, y_discard = train_test_split(
        text, values, train_size=train_size)

    logger.log('Vectorizing the data.')
    vectorizer = CountVectorizer(
        max_features=config.SENTIMENT_BAG_OF_WORDS_SIZE)
    X = vectorizer.fit_transform(X_train).toarray()
    y = y_train

    logger.log('Fitting the model.')
    classifier = GaussianNB()
    classifier.fit(X, y)

    logger.log('Determining accuracy.')
    accuracies = cross_val_score(estimator=classifier,
                                 X=X,
                                 y=y,
                                 cv=10,
                                 pre_dispatch=8)
    avg_accuracy = accuracies.mean() * 100
    std_dev = accuracies.std() * 100

    print('Saving results.')
    model_info = {
        '_id': str(uuid4()),
        'model_type': MODEL_TYPE,
        'has_vectorizor': True,
        'is_current_model': True,
        'acc': avg_accuracy,
        'std_dev': std_dev
    }

    model = Model(model_info, classifier, vectorizer)
    model_service.save_model(model)

    is_training = False
    classifier_ready = True
    logger.log('Training completed.\nResults:\nAverage: ' + str(avg_accuracy) +
               '\nStandard Deviation: ' + str(std_dev))
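
Inference against this model has to reuse the fitted CountVectorizer so the bag-of-words columns line up with training; a minimal sketch against the module-level globals (this predict helper is hypothetical, not part of the original module):

# Hypothetical inference path for the GaussianNB model trained above.
def predict(texts):
    if not classifier_ready:
        raise TrainingInProgressError()
    features = vectorizer.transform(texts).toarray()  # same vocabulary as training
    return classifier.predict(features)
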
Code example #6
 def parse(self, item):
     if debug[-1]:
         logger.log(lvl="INFO",
                    msg="message received: %s" % item,
                    orig=self.cn)
     if item.startswith("send") or item.startswith("client"):
         return super(WebsocketObserver, self).parse(item)
     else:
         out = self.ptv.do_it(item)
         # in case of error
         if isinstance(out, str) and "ERROR" in out:
             print(out)
         return False, None, None
Code example #7
def clean(data):
    global is_cleaning

    if is_cleaning:
        raise CleaningInProgressError()

    if len(data) > config.SENTIMENT_SINGLE_THREAD_CUTOFF:
        logger.log('Cleaning text.')
        is_cleaning = True

        with Pool(processes=8) as pool:
            clean_data = pool.map(clean_single, data)

        is_cleaning = False
        logger.log('Cleaning complete.')
    else:
        clean_data = map(clean_single, data)

    return list(clean_data)
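
Because clean() creates a multiprocessing.Pool, on platforms that use the spawn start method (Windows, and macOS on recent Pythons) the calling module needs an import guard; otherwise every worker re-executes the module's top-level code. A minimal sketch:

from multiprocessing import Pool  # the import clean() relies on

if __name__ == '__main__':
    # Only the parent process runs this block, so the workers that
    # Pool spawns can import the module without re-entering clean().
    cleaned = clean(['first document ...', 'second document ...'])
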
Code example #8
def clean(data):
    # Clean the text, in parallel above the single-thread cutoff
    logger.log('Cleaning text.')
    if len(data) > config.CLEANER_SINGLE_THREAD_CUTOFF:
        with Pool(processes=8) as pool:
            clean_data = pool.map(clean_text_single, data)
    else:
        clean_data = map(clean_text_single, data)
    logger.log('Cleaning complete.')

    return list(clean_data)
Code example #9
def find_best_params(X, y):
    logger.log('Finding the best parameters')

    # parameters for KNeighborsClassifier
    # parameters = {
    #     'n_neighbors': range(3, 20), # best: 16
    #     'weights': ['uniform', 'distance'], # best: distance
    #     'metric': ['euclidean', 'minkowski', 'manhattan'], # best: minkowski
    #     'p': range(1,5) # best: 1
    # }

    # parameters for RandomForestClassifier
    # parameters = {
    #     'n_estimators': [1, 5, 10, 100, 250, 500], # best: 500
    #     'criterion': ['gini', 'entropy'] # best: entropy
    # }

    parameters = {
        'n_estimators': [500, 750, 1000, 2000],  # best: 500
        'criterion': ['entropy']  # best: entropy
    }

    start = time.time()
    grid_search = GridSearchCV(estimator=RandomForestClassifier(),
                               param_grid=parameters,
                               scoring='accuracy',
                               cv=10,
                               pre_dispatch=8,
                               n_jobs=-1)

    grid_search = grid_search.fit(X, y)
    end = time.time()
    final_time = end - start

    best_accuracy = grid_search.best_score_
    best_parameters = grid_search.best_params_
    logger.log(
        f'Best accuracy: {best_accuracy}\nBest Parameters: {best_parameters}\n'
        f'Search took {final_time}s.\n'
        'Training complete. In order to save model please re-run with the given parameters.'
    )
    logger.log('Debug here')
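
As the final log message says, the search only reports the winning configuration; a sketch of the follow-up run it asks for, plugging in the values recorded in the "# best:" comments above:

# Hypothetical follow-up: rebuild and fit the classifier with the reported parameters.
best = {'n_estimators': 500, 'criterion': 'entropy'}  # from the '# best:' comments above
classifier = RandomForestClassifier(**best, n_jobs=-1)
classifier.fit(X, y)
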
Code example #10
def train(dataset):
    global is_training
    global tokenizer
    global classifier
    global classifier_ready

    if is_training:
        raise TrainingInProgressError()

    logger.log('Starting training...')
    is_training = True
    classifier_ready = False

    X, y = zip(*dataset.data)

    y = np.asarray(y)

    # Note: This should only be used for Embedded models
    logger.log('Encoding the data.')
    tokenizer = Tokenizer(num_words=config.STOCK_V2_BAG_OF_WORDS_SIZE)
    tokenizer.fit_on_texts(X)
    X = tokenizer.texts_to_sequences(X)  # ragged list of lists; pad_sequences accepts it directly
    X = pad_sequences(X,
                      padding='post',
                      value=0,
                      maxlen=config.MAX_SEQUENCE_LENGTH
                      )  # Only ~3% of articles are greater than 500 words
    sequence_length = len(X[0])
    tokenizer.max_length = sequence_length
    n_words = config.STOCK_V2_BAG_OF_WORDS_SIZE

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=config.TRAINING_SET_SIZE)

    logger.log('Fitting the model.')
    start = time.time()

    classifier = build_lstm(sequence_length,
                            n_words,
                            starting_output_dim=3000,
                            batch_size=100)  # acc:

    classifier.fit(X_train, y_train)

    end = time.time()
    final = end - start
    logger.log('Fitting completed in ' + str(final) + 's')

    # find_best_params(X, y)

    logger.log('Determining accuracy.')

    # Acc for ANN:
    avg_accuracy = classifier.evaluate(X_test, y_test)
    avg_accuracy = avg_accuracy * 100
    std_dev = 0

    print('Saving results.')
    model_info = {
        '_id': str(uuid4()),
        'model_type': MODEL_TYPE,
        'is_current_model': True,
        'acc': avg_accuracy,
        'std_dev': std_dev,
        'use_keras_save': True
    }

    model = Model(model_info, classifier, tokenizer=tokenizer)
    model_service.save_model(model)

    is_training = False
    classifier_ready = True
    logger.log('Training completed.\nResults:\nAverage: ' + str(avg_accuracy) +
               '\nStandard Deviation: ' + str(std_dev))

    logger.log('Debugger')
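
At prediction time the exact tokenization and padding from train() have to be replayed, which is why the sequence length is stashed on the tokenizer; a minimal sketch (encode_for_inference is a hypothetical helper, not part of the original module):

# Hypothetical inference preprocessing for the LSTM trained above:
# raw text -> integer sequences -> post-padded to the training length.
def encode_for_inference(texts):
    seqs = tokenizer.texts_to_sequences(texts)
    return pad_sequences(seqs, padding='post', value=0,
                         maxlen=tokenizer.max_length)

# probabilities = classifier.predict(encode_for_inference(['some article text']))
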
Code example #11
def train(dataset):
    global is_training
    global vectorizer
    global classifier
    global classifier_ready
    global label_encoder
    global one_hot_encoder

    if is_training:
        raise TrainingInProgressError()

    logger.log('Starting training...')
    is_training = True
    classifier_ready = False

    text, values = zip(*dataset.data)

    X_train, X_test, y_train, y_test = train_test_split(
        text, values, train_size=config.TRAINING_SET_SIZE)

    logger.log('Encoding the data.')
    label_encoder = LabelEncoder()
    y_train_labeled = label_encoder.fit_transform(y_train)
    y_test_labeled = label_encoder.transform(y_test)

    one_hot_encoder = OneHotEncoder(drop='first',
                                    handle_unknown='error',
                                    categories='auto')
    y_train_encoded = one_hot_encoder.fit_transform(
        y_train_labeled.reshape(-1, 1)).toarray()
    y_test_encoded = one_hot_encoder.transform(y_test_labeled.reshape(
        -1, 1)).toarray()

    logger.log('Vectorizing the data.')
    # vectorizer = CountVectorizer(max_features=config.CATEGORY_BAG_OF_WORDS_SIZE)
    vectorizer = TfidfVectorizer(
        max_features=config.CATEGORY_BAG_OF_WORDS_SIZE)

    X_train = vectorizer.fit_transform(X_train).toarray()
    y_train = y_train_encoded

    num_categories = len(y_train[0])

    X_test = vectorizer.transform(X_test).toarray()
    y_test = y_test_encoded
    # y_train = y_labeled # Note this is only for classifiers that do not support multilabel

    logger.log('Fitting the model.')
    start = time.time()

    # classifier = KNeighborsClassifier(n_neighbors = 3, metric = 'minkowski', p = 1, weights='distance') # acc: 44%, 25%
    classifier = RandomForestClassifier(n_estimators=100,
                                        criterion='gini',
                                        n_jobs=-1)  # acc: 64%, 96%
    # classifier = GaussianNB() # acc: 30%, 3%

    # ANN classifier acc: 67%, 81%
    # classifier = NeuralNetwork('SEQUENTIAL', 10)
    # classifier.add(Dense(activation='relu', input_dim=config.CATEGORY_BAG_OF_WORDS_SIZE, units=250))
    # classifier.add(Dense(activation='softmax', units=num_categories))

    # Other optimizers: rmsprop, adagrad, adam, adadelta, adamax, nadam, SGD(lr=0.01)
    # optimizer = 'adadelta' # Best: adadelta

    # classifier.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    classifier.fit(X_train, y_train)

    end = time.time()
    final = end - start

    logger.log('Fitting completed in ' + str(final) + 's')

    # logger.log('Finding the best parameters')

    # parameters for KNeighborsClassifier
    # parameters = {
    #     'n_neighbors': range(3, 20), # best: 3
    #     'weights': ['uniform', 'distance'], # best: distance
    #     'metric': ['euclidean', 'minkowski', 'manhattan'], # best: minkowski
    #     'p': range(1,5) # best: 1
    # }

    # parameters for RandomForestClassifier
    # parameters = {
    #     'n_estimators': [100], # best: 100
    #     'criterion': ['gini', 'entropy'] # best: gini
    # }

    # start = time.time()
    # grid_search = GridSearchCV(estimator = RandomForestClassifier(),
    #                         param_grid = parameters,
    #                         scoring = 'accuracy',
    #                         cv = 10,
    #                         pre_dispatch=8,
    #                         n_jobs=-1)

    # grid_search = grid_search.fit(X, y)
    # end = time.time()

    # best_accuracy = grid_search.best_score_
    # best_parameters = grid_search.best_params_
    # logger.log(f'Best accuracy: {best_accuracy}\nBest Parameters: {best_parameters}\nTraining complete. In order to save model please re-run with the given parameters.')

    logger.log('Determining accuracy.')

    # Acc for statistical models
    accuracies = cross_val_score(estimator=classifier,
                                 X=X_train,
                                 y=y_train_labeled,
                                 cv=10,
                                 n_jobs=-1)
    avg_accuracy = accuracies.mean() * 100
    std_dev = accuracies.std() * 100

    # Acc for ANN:
    # avg_accuracy = classifier.evaluate(X_test, y_test)
    # avg_accuracy = avg_accuracy * 100
    # std_dev = 0

    print('Saving results.')
    model_info = {
        '_id': str(uuid4()),
        'model_type': MODEL_TYPE,
        'has_vectorizor': True,
        'has_encoders': True,
        'is_current_model': True,
        'acc': avg_accuracy,
        'std_dev': std_dev,
        'use_keras_save': False
    }

    model = Model(model_info, classifier, vectorizer, label_encoder,
                  one_hot_encoder)
    model_service.save_model(model)

    is_training = False
    classifier_ready = True
    logger.log('Training completed.\nResults:\nAverage: ' + str(avg_accuracy) +
               '\nStandard Deviation: ' + str(std_dev))
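
Mapping this classifier's one-hot predictions back to category names means inverting the two encoders in reverse order; a sketch (with drop='first', an all-zero row decodes to the dropped first label):

# Hypothetical decoding path for the classifier trained above.
y_pred = classifier.predict(X_test)                          # one-hot rows
labels = one_hot_encoder.inverse_transform(y_pred).ravel()   # back to integer labels
categories = label_encoder.inverse_transform(labels.astype(int))
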
Code example #12
def train(dataset):
    global is_training
    global vectorizer
    global classifier
    global classifier_ready

    if is_training:
        raise TrainingInProgressError()

    logger.log('Starting training...')
    is_training = True
    classifier_ready = False

    df = pd.DataFrame(dataset.data,
                      columns=['sentiment', 'category', 'result'])

    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=config.TRAINING_SET_SIZE)

    logger.log('Fitting the model.')
    start = time.time()

    # find_best_params(X, y)

    # classifier = KNeighborsClassifier(n_neighbors = 9, metric = 'euclidean', p = 1, weights='uniform') # acc: 51.91% std_dev: 3.76%
    # classifier = GaussianNB() # acc: 57.44% std_dev: 0.07%
    classifier = RandomForestClassifier(
        n_estimators=10, criterion='gini',
        n_jobs=-1)  # acc: 57.56% std_dev: 0.88%
    # classifier = build_ann() # acc: 56.79%

    classifier.fit(X_train, y_train)

    end = time.time()
    final = end - start

    logger.log('Fitting completed in ' + str(final) + 's')

    logger.log('Determining accuracy.')

    # Acc for statistical models
    accuracies = cross_val_score(estimator=classifier,
                                 X=X_train,
                                 y=y_train,
                                 cv=10,
                                 n_jobs=-1)
    avg_accuracy = accuracies.mean() * 100
    std_dev = accuracies.std() * 100

    # Acc for ANN:
    # avg_accuracy = classifier.evaluate(X_test, y_test)
    # avg_accuracy = avg_accuracy * 100
    # std_dev = 0

    print('Saving results.')
    model_info = {
        '_id': str(uuid4()),
        'model_type': MODEL_TYPE,
        'has_vectorizor': True,
        'has_encoders': False,
        'is_current_model': True,
        'acc': avg_accuracy,
        'std_dev': std_dev,
        'use_keras_save': False
    }

    model = Model(model_info, classifier, vectorizer)
    model_service.save_model(model)

    is_training = False
    classifier_ready = True
    logger.log('Training completed.\nResults:\nAverage: ' + str(avg_accuracy) +
               '\nStandard Deviation: ' + str(std_dev))
    logger.log('Debug')
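
This last model is a small meta-classifier: its two features are the outputs of the sentiment and category models, and it predicts the final result. A hypothetical prediction call (the feature values are illustrative):

# Feature order must match the DataFrame columns above: ['sentiment', 'category'].
prediction = classifier.predict([[0.73, 4]])  # illustrative sentiment score, encoded category
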
Code example #13
File: app.py Project: punker76/angular-bokeh
# NOTE to switch on logging of the aiohttp server
# logging.basicConfig(level=logging.DEBUG)

_root_ = os.path.realpath(os.path.dirname(__file__))

debug.append(False)
port = 9000

if __name__ == '__main__':
    ptv = PythonToView()
    server = AiohttpServer(ptv)
    ptv.setServer(server)

    path = os.path.join(_root_, "../client/dist/dev")
    if not os.path.exists(path):
        logger.log(lvl="ERROR", msg="build the angular app first")
        exit()

    async def handle(request):
        return web.FileResponse(os.path.join(path, 'index.html'))

    # NOTE the app is already started in the server ....
    app = server.app
    app.add_routes([web.get('/', handle)])
    # NOTE for angular projects this is necessary ...
    app.add_routes([web.static('/', os.path.join(path, './'))])
    logger.log(lvl="INFO", msg="start server on localhost port %s" % port)
    logger.log(lvl="INFO", msg=f"path: {path}")
    runner = server.startService('0.0.0.0', port)
    interactionObservable = Observable()
    interaction = Interaction("ia", ptv, interactionObservable)