def build_lstm(sequence_length, n_words, starting_output_dim, batch_size=100):
    mem_size_GB = ((starting_output_dim * sequence_length * batch_size * 8) /
                   1000000000)
    logger.log(f'Current network will take up {mem_size_GB} GB of memory.')
    if mem_size_GB > config.MAX_MEMORY_GB:
        raise Exception(
            'Network is configured to use more than available RAM. '
            'Total size of current network: ' + str(mem_size_GB) + 'GB')
    classifier = NeuralNetwork('SEQUENTIAL', epochs=40, batch_size=batch_size)
    # Embed the text sequences
    classifier.add(
        Embedding(input_dim=n_words,
                  output_dim=starting_output_dim,
                  input_length=sequence_length,
                  trainable=True))
    classifier.add(
        LSTM(units=int(starting_output_dim / 2), return_sequences=True))
    classifier.add(
        LSTM(units=int(starting_output_dim / 4), return_sequences=False))
    classifier.add(Dense(units=1, activation='sigmoid'))
    # Other optimizers: rmsprop, adagrad, adam, adadelta, adamax, nadam,
    # SGD(lr=0.01)
    classifier.compile(optimizer=SGD(learning_rate=0.0001, momentum=0.0),
                       loss='binary_crossentropy',
                       metrics=['accuracy'])
    return classifier
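# Worked example of the memory estimate above (hypothetical numbers): with
# starting_output_dim=3000, sequence_length=500 and batch_size=100, at
# 8 bytes per float64 the embedding output alone is
#     3000 * 500 * 100 * 8 / 1e9 = 1.2 GB,
# which is the figure mem_size_GB reports. A minimal usage sketch; the
# vocabulary size is illustrative, not a config value:
#
#     lstm = build_lstm(sequence_length=500, n_words=10000,
#                       starting_output_dim=3000, batch_size=100)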
def do_it(self, a):
    try:
        inp = json.loads(a)
        is_json = True
    except ValueError:
        is_json = False
    logger.log(lvl="INFO", msg="from angular: %s" % a[0:100], orig=self.cn)
    if is_json:
        fct = inp["name"]
        args = inp["args"]
        user = inp["user"]
        if self.sleep and fct not in modalFct:
            return
        try:
            method = getattr(self, fct)
            if inspect.iscoroutinefunction(method):
                aio.ensure_future(method(*args, user=user))
            else:
                return method(*args, user=user)
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            formatted_lines = traceback.format_exc().splitlines()
            return ("ERROR no valid json-function call from junction via .. "
                    "%s \n %s \n %s \n %s" %
                    (fct, exc_type, exc_value, formatted_lines))
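# Shape of the JSON payload do_it() expects (field names taken from the
# parsing above; the method name and values here are hypothetical):
#
#     {"name": "refresh_view", "args": ["panel-1", 2], "user": "alice"}
#
# "name" is resolved via getattr on this class, so it must be an existing
# method; coroutines are scheduled with aio.ensure_future, plain methods are
# called synchronously and their return value passed back.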
def save(self, file_path):
    try:
        self.model.save(file_path + '.h5')
        logger.log('Model saved.')
        return True
    except Exception as e:
        logger.log_error(e)
        return False
def load(self, file_path):
    try:
        self.model = load_model(file_path + '.h5')
        logger.log('Model loaded.')
        self.model.summary()
        return True
    except Exception as e:
        logger.log_error(e)
        return False
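# Usage sketch for the save/load pair above: file_path is a base path with
# no extension, since '.h5' is appended internally. Paths are hypothetical:
#
#     wrapper.save('/tmp/stock_lstm')   # writes /tmp/stock_lstm.h5
#     wrapper.load('/tmp/stock_lstm')   # restores the model, prints summary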
def train(dataset):
    global is_training
    global vectorizer
    global classifier
    global classifier_ready
    if is_training:
        raise TrainingInProgressError()
    logger.log('Starting training...')
    is_training = True
    classifier_ready = False
    text, values = zip(*dataset.data)
    dataset_size = len(text)
    # train_size is either a fraction (0.9) or an absolute row cap, both of
    # which train_test_split accepts
    train_size = (0.9 if dataset_size <= config.MAX_DATASET_SIZE
                  else config.MAX_DATASET_SIZE)
    X_train, X_discard, y_train, y_discard = train_test_split(
        text, values, train_size=train_size)
    logger.log('Vectorizing the data.')
    vectorizer = CountVectorizer(
        max_features=config.SENTIMENT_BAG_OF_WORDS_SIZE)
    X = vectorizer.fit_transform(X_train).toarray()
    y = y_train
    logger.log('Fitting the model.')
    classifier = GaussianNB()
    classifier.fit(X, y)
    logger.log('Determining accuracy.')
    accuracies = cross_val_score(estimator=classifier,
                                 X=X,
                                 y=y,
                                 cv=10,
                                 pre_dispatch=8)
    avg_accuracy = accuracies.mean() * 100
    std_dev = accuracies.std() * 100
    print('Saving results.')
    model_info = {
        '_id': str(uuid4()),
        'model_type': MODEL_TYPE,
        'has_vectorizor': True,
        'is_current_model': True,
        'acc': avg_accuracy,
        'std_dev': std_dev
    }
    model = Model(model_info, classifier, vectorizer)
    model_service.save_model(model)
    is_training = False
    classifier_ready = True
    logger.log('Training completed.\nResults:\nAverage: ' +
               str(avg_accuracy) + '\nStandard Deviation: ' + str(std_dev))
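# Input shape implied by zip(*dataset.data) above: a sequence of
# (text, label) pairs. A minimal hypothetical example:
#
#     dataset.data = [('great quarterly earnings', 1),
#                     ('shares plunge on weak guidance', 0)]
#     train(dataset)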
def parse(self, item):
    if debug[-1]:
        logger.log(lvl="INFO", msg="message received: %s" % item,
                   orig=self.cn)
    if item.startswith("send") or item.startswith("client"):
        return super(WebsocketObserver, self).parse(item)
    else:
        out = self.ptv.do_it(item)
        # in case of error
        if isinstance(out, str):
            if "ERROR" in out:
                print(out)
        return False, None, None
def clean(data):
    global is_cleaning
    if is_cleaning:
        raise CleaningInProgressError()
    if len(data) > config.SENTIMENT_SINGLE_THREAD_CUTOFF:
        logger.log('Cleaning text.')
        is_cleaning = True
        # context manager ensures the worker pool is torn down
        with Pool(processes=8) as pool:
            clean_data = pool.map(clean_single, data)
        is_cleaning = False
        logger.log('Cleaning complete.')
    else:
        clean_data = map(clean_single, data)
    return list(clean_data)
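# Usage sketch for clean(): at or below the cutoff the documents are mapped
# through clean_single in-process; above it the work fans out to 8 worker
# processes. The input strings are hypothetical:
#
#     cleaned = clean(['Some <b>raw</b> article text!', 'Another doc...'])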
def clean(data):
    # Clean the text
    logger.log('Cleaning text.')
    if len(data) > config.CLEANER_SINGLE_THREAD_CUTOFF:
        with Pool(processes=8) as pool:
            clean_data = pool.map(clean_text_single, data)
    else:
        clean_data = map(clean_text_single, data)
    #endif
    logger.log('Cleaning complete.')
    return list(clean_data)
# clean()
def find_best_params(X, y):
    logger.log('Finding the best parameters')
    # parameters for KNeighborsClassifier
    # parameters = {
    #     'n_neighbors': range(3, 20),  # best: 16
    #     'weights': ['uniform', 'distance'],  # best: distance
    #     'metric': ['euclidean', 'minkowski', 'manhattan'],  # best: minkowski
    #     'p': range(1, 5)  # best: 1
    # }
    # parameters for RandomForestClassifier
    # parameters = {
    #     'n_estimators': [1, 5, 10, 100, 250, 500],  # best: 500
    #     'criterion': ['gini', 'entropy']  # best: entropy
    # }
    parameters = {
        'n_estimators': [500, 750, 1000, 2000],  # best: 500
        'criterion': ['entropy']  # best: entropy
    }
    start = time.time()
    grid_search = GridSearchCV(estimator=RandomForestClassifier(),
                               param_grid=parameters,
                               scoring='accuracy',
                               cv=10,
                               pre_dispatch=8,
                               n_jobs=-1)
    grid_search = grid_search.fit(X, y)
    end = time.time()
    final_time = end - start
    best_accuracy = grid_search.best_score_
    best_parameters = grid_search.best_params_
    logger.log('Best accuracy: ' + str(best_accuracy) +
               '\nBest Parameters: ' + str(best_parameters) +
               '\nTraining complete. In order to save model please re-run '
               'with the given parameters.')
    logger.log('Debug here')
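# Per the log message above, the intended workflow is to read the winning
# parameters off the grid search and hard-code them for the real training
# run, e.g. (a sketch using the best values noted in the comments):
#
#     classifier = RandomForestClassifier(n_estimators=500,
#                                         criterion='entropy', n_jobs=-1)
#     classifier.fit(X, y)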
def train(dataset):
    global is_training
    global tokenizer
    global classifier
    global classifier_ready
    if is_training:
        raise TrainingInProgressError()
    logger.log('Starting training...')
    is_training = True
    classifier_ready = False
    X, y = zip(*dataset.data)
    y = np.asarray(y)
    # Note: This should only be used for Embedded models
    logger.log('Encoding the data.')
    tokenizer = Tokenizer(num_words=config.STOCK_V2_BAG_OF_WORDS_SIZE)
    tokenizer.fit_on_texts(X)
    X = np.array([np.array(xi) for xi in tokenizer.texts_to_sequences(X)])
    # Only ~3% of articles are greater than 500 words
    X = pad_sequences(X, padding='post', value=0,
                      maxlen=config.MAX_SEQUENCE_LENGTH)
    sequence_length = len(X[0])
    tokenizer.max_length = sequence_length
    n_words = config.STOCK_V2_BAG_OF_WORDS_SIZE
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=config.TRAINING_SET_SIZE)
    logger.log('Fitting the model.')
    start = time.time()
    classifier = build_lstm(sequence_length,
                            n_words,
                            starting_output_dim=3000,
                            batch_size=100)  # acc:
    classifier.fit(X_train, y_train)
    end = time.time()
    final = end - start
    logger.log('Fitting completed in ' + str(final) + 's')
    # find_best_params(X, y)
    logger.log('Determining accuracy.')
    # Acc for ANN:
    avg_accuracy = classifier.evaluate(X_test, y_test)
    avg_accuracy = avg_accuracy * 100
    std_dev = 0
    print('Saving results.')
    model_info = {
        '_id': str(uuid4()),
        'model_type': MODEL_TYPE,
        'is_current_model': True,
        'acc': avg_accuracy,
        'std_dev': std_dev,
        'use_keras_save': True
    }
    model = Model(model_info, classifier, tokenizer=tokenizer)
    model_service.save_model(model)
    is_training = False
    classifier_ready = True
    logger.log('Training completed.\nResults:\nAverage: ' +
               str(avg_accuracy) + '\nStandard Deviation: ' + str(std_dev))
    logger.log('Debugger')
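# Standalone worked example of the encoding step above, using the same Keras
# preprocessing APIs; the texts, vocab size and maxlen are illustrative, not
# the config values used in train().

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tok = Tokenizer(num_words=50)
tok.fit_on_texts(['stocks rally today', 'stocks fall'])
seqs = tok.texts_to_sequences(['stocks rally today', 'stocks fall'])
print(seqs)  # [[1, 2, 3], [1, 4]] -- indices assigned by word frequency
padded = pad_sequences(seqs, padding='post', value=0, maxlen=5)
print(padded)  # [[1 2 3 0 0]
               #  [1 4 0 0 0]]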
def train(dataset):
    global is_training
    global vectorizer
    global classifier
    global classifier_ready
    global label_encoder
    global one_hot_encoder
    if is_training:
        raise TrainingInProgressError()
    logger.log('Starting training...')
    is_training = True
    classifier_ready = False
    text, values = zip(*dataset.data)
    X_train, X_test, y_train, y_test = train_test_split(
        text, values, train_size=config.TRAINING_SET_SIZE)
    logger.log('Encoding the data.')
    label_encoder = LabelEncoder()
    y_train_labeled = label_encoder.fit_transform(y_train)
    y_test_labeled = label_encoder.transform(y_test)
    one_hot_encoder = OneHotEncoder(drop='first',
                                    handle_unknown='error',
                                    categories='auto')
    y_train_encoded = one_hot_encoder.fit_transform(
        y_train_labeled.reshape(-1, 1)).toarray()
    y_test_encoded = one_hot_encoder.transform(
        y_test_labeled.reshape(-1, 1)).toarray()
    logger.log('Vectorizing the data.')
    # vectorizer = CountVectorizer(max_features=config.CATEGORY_BAG_OF_WORDS_SIZE)
    vectorizer = TfidfVectorizer(
        max_features=config.CATEGORY_BAG_OF_WORDS_SIZE)
    X_train = vectorizer.fit_transform(X_train).toarray()
    y_train = y_train_encoded
    num_categories = len(y_train[0])
    X_test = vectorizer.transform(X_test).toarray()
    y_test = y_test_encoded
    # y_train = y_labeled  # Note this is only for classifiers that do not
    # support multilabel
    logger.log('Fitting the model.')
    start = time.time()
    # classifier = KNeighborsClassifier(n_neighbors=3, metric='minkowski',
    #                                   p=1, weights='distance')  # acc: 44%, 25%
    classifier = RandomForestClassifier(n_estimators=100,
                                        criterion='gini',
                                        n_jobs=-1)  # acc: 64%, 96%
    # classifier = GaussianNB()  # acc: 30%, 3%
    # ANN classifier acc: 67%, 81%
    # classifier = NeuralNetwork('SEQUENTIAL', 10)
    # classifier.add(Dense(activation='relu',
    #                      input_dim=config.CATEGORY_BAG_OF_WORDS_SIZE,
    #                      units=250))
    # classifier.add(Dense(activation='softmax', units=num_categories))
    # Other optimizers: rmsprop, adagrad, adam, adadelta, adamax, nadam,
    # SGD(lr=0.01)
    # optimizer = 'adadelta'  # Best: adadelta
    # classifier.compile(optimizer=optimizer, loss='categorical_crossentropy',
    #                    metrics=['accuracy'])
    classifier.fit(X_train, y_train)
    end = time.time()
    final = end - start
    logger.log('Fitting completed in ' + str(final) + 's')
    # logger.log('Finding the best parameters')
    # parameters for KNeighborsClassifier
    # parameters = {
    #     'n_neighbors': range(3, 20),  # best: 3
    #     'weights': ['uniform', 'distance'],  # best: distance
    #     'metric': ['euclidean', 'minkowski', 'manhattan'],  # best: minkowski
    #     'p': range(1, 5)  # best: 1
    # }
    # parameters for RandomForestClassifier
    # parameters = {
    #     'n_estimators': [100],  # best: 100
    #     'criterion': ['gini', 'entropy']  # best: gini
    # }
    # start = time.time()
    # grid_search = GridSearchCV(estimator=RandomForestClassifier(),
    #                            param_grid=parameters,
    #                            scoring='accuracy',
    #                            cv=10,
    #                            pre_dispatch=8,
    #                            n_jobs=-1)
    # grid_search = grid_search.fit(X, y)
    # end = time.time()
    # best_accuracy = grid_search.best_score_
    # best_parameters = grid_search.best_params_
    # logger.log(f'Best accuracy: {best_accuracy}\nBest Parameters: '
    #            f'{best_parameters}\nTraining complete. In order to save '
    #            f'model please re-run with the given parameters.')
    logger.log('Determining accuracy.')
    # Acc for statistical models
    accuracies = cross_val_score(estimator=classifier,
                                 X=X_train,
                                 y=y_train_labeled,
                                 cv=10,
                                 n_jobs=-1)
    avg_accuracy = accuracies.mean() * 100
    std_dev = accuracies.std() * 100
    # Acc for ANN:
    # avg_accuracy = classifier.evaluate(X_test, y_test)
    # avg_accuracy = avg_accuracy * 100
    # std_dev = 0
    print('Saving results.')
    model_info = {
        '_id': str(uuid4()),
        'model_type': MODEL_TYPE,
        'has_vectorizor': True,
        'has_encoders': True,
        'is_current_model': True,
        'acc': avg_accuracy,
        'std_dev': std_dev,
        'use_keras_save': False
    }
    model = Model(model_info, classifier, vectorizer, label_encoder,
                  one_hot_encoder)
    model_service.save_model(model)
    is_training = False
    classifier_ready = True
    logger.log('Training completed.\nResults:\nAverage: ' +
               str(avg_accuracy) + '\nStandard Deviation: ' + str(std_dev))
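# Standalone worked example of the label encoding above, using the same
# sklearn APIs (category names are hypothetical): with n classes,
# OneHotEncoder(drop='first') keeps n - 1 columns, so the first class
# encodes as all zeros.

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

le = LabelEncoder()
labeled = le.fit_transform(['tech', 'energy', 'health'])
print(labeled)  # [2 0 1] -- classes are sorted alphabetically
ohe = OneHotEncoder(drop='first', categories='auto')
print(ohe.fit_transform(labeled.reshape(-1, 1)).toarray())
# [[0. 1.]
#  [0. 0.]
#  [1. 0.]]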
def train(dataset):
    global is_training
    global vectorizer
    global classifier
    global classifier_ready
    if is_training:
        raise TrainingInProgressError()
    logger.log('Starting training...')
    is_training = True
    classifier_ready = False
    df = pd.DataFrame(dataset.data,
                      columns=['sentiment', 'category', 'result'])
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=config.TRAINING_SET_SIZE)
    logger.log('Fitting the model.')
    start = time.time()
    # find_best_params(X, y)
    # classifier = KNeighborsClassifier(n_neighbors=9, metric='euclidean',
    #                                   p=1, weights='uniform')
    #     acc: 51.91% std_dev: 3.76%
    # classifier = GaussianNB()  # acc: 57.44% std_dev: 0.07%
    classifier = RandomForestClassifier(
        n_estimators=10, criterion='gini',
        n_jobs=-1)  # acc: 57.56% std_dev: 0.88%
    # classifier = build_ann()  # acc: 56.79%
    classifier.fit(X_train, y_train)
    end = time.time()
    final = end - start
    logger.log('Fitting completed in ' + str(final) + 's')
    logger.log('Determining accuracy.')
    # Acc for statistical models
    accuracies = cross_val_score(estimator=classifier,
                                 X=X_train,
                                 y=y_train,
                                 cv=10,
                                 n_jobs=-1)
    avg_accuracy = accuracies.mean() * 100
    std_dev = accuracies.std() * 100
    # Acc for ANN:
    # avg_accuracy = classifier.evaluate(X_test, y_test)
    # avg_accuracy = avg_accuracy * 100
    # std_dev = 0
    print('Saving results.')
    model_info = {
        '_id': str(uuid4()),
        'model_type': MODEL_TYPE,
        'has_vectorizor': True,
        'has_encoders': False,
        'is_current_model': True,
        'acc': avg_accuracy,
        'std_dev': std_dev,
        'use_keras_save': False
    }
    model = Model(model_info, classifier, vectorizer)
    model_service.save_model(model)
    is_training = False
    classifier_ready = True
    logger.log('Training completed.\nResults:\nAverage: ' +
               str(avg_accuracy) + '\nStandard Deviation: ' + str(std_dev))
    logger.log('Debug')
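# Input shape implied by the DataFrame columns above: each row pairs a
# sentiment score and a category with the observed result, and the last
# column is the label. Hypothetical rows:
#
#     dataset.data = [(0.82, 3, 1), (0.14, 7, 0)]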
# NOTE to switch on logging of the aiohttp server
# logging.basicConfig(level=logging.DEBUG)

_root_ = os.path.realpath(os.path.dirname(__file__))
debug.append(False)
port = 9000

if __name__ == '__main__':
    ptv = PythonToView()
    server = AiohttpServer(ptv)
    ptv.setServer(server)
    path = os.path.join(_root_, "../client/dist/dev")
    if not os.path.exists(path):
        logger.log(lvl="ERROR", msg="build the angular app first")
        exit()

    async def handle(request):
        return web.FileResponse(os.path.join(path, 'index.html'))

    # NOTE the app is already started in the server ....
    app = server.app
    app.add_routes([web.get('/', handle)])
    # NOTE for angular projects this is necessary ...
    app.add_routes([web.static('/', os.path.join(path, './'))])
    logger.log(lvl="INFO", msg="start server on localhost port %s" % port)
    logger.log(lvl="INFO", msg=f"path: {path}")
    runner = server.startService('0.0.0.0', port)
    interactionObservable = Observable()
    interaction = Interaction("ia", ptv, interactionObservable)
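# To run (assuming this module is the entry point): build the Angular app to
# ../client/dist/dev first, start this script, then browse to
# http://localhost:9000/ -- index.html is served at '/' and the rest of the
# bundle through the static route above.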