def training(features, target, cycle):
    """Run one incremental training step of an SGD regressor.

    On the first cycle a fresh ``SGDRegressor`` and ``MaxAbsScaler`` are
    created; on every later cycle the previously persisted pair is loaded
    from ``DIR_PATH`` via ``repository.get``, so training continues where
    the last call left off.

    Args:
        features: Batch of input features (array-like accepted by
            ``MaxAbsScaler.partial_fit`` / ``transform``).
        target: Batch of target values aligned with ``features``.
        cycle: 1-based batch counter; ``cycle == 1`` means "start fresh".

    Returns:
        dict: ``{'message': ...}`` indicating success or failure; failures
        are reported on stderr rather than raised, so callers always get
        a dict back.
    """
    try:
        if cycle != 1:
            # Resume: load the model/scaler persisted by the previous cycle.
            model = repository.get(SGDRegressor.__name__, DIR_PATH)
            scaler = repository.get(MaxAbsScaler.__name__, DIR_PATH)
        else:
            # First cycle: start with a fresh model and scaler.
            model = SGDRegressor(loss='epsilon_insensitive', penalty='l2')
            scaler = MaxAbsScaler()
        # Update scaling statistics with this batch, then scale it.
        scaler.partial_fit(features)
        features = scaler.transform(features)
        # One incremental fit step on the scaled batch.
        model.partial_fit(features, target)
        # Persist both artifacts so the next cycle can resume from them.
        repository.create(model, SGDRegressor.__name__, DIR_PATH)
        repository.create(scaler, MaxAbsScaler.__name__, DIR_PATH)
        return {'message': 'training successful '}
    except Exception:
        # print_exc() includes the exception type and message; the original
        # print_tb(e.__traceback__) printed only the frames and silently
        # dropped what actually went wrong.
        traceback.print_exc()
        return {'message': 'training failed '}
train_text_dtm = text_vectorizer.transform(train['Text']) #Non-negative matrix factorisation to identify topics in Text #TO DO #Remove text columns that have already been converted into numeric features train_features = train.drop(['Text', 'Summary'], axis='columns') #Convert features to sparse matrix train_features = csr_matrix(train_features.values) #Combine sparse matrices train = hstack([train_summary_dtm, train_text_dtm, train_features]) #Scale scaler.partial_fit(train) #Loop through chunks for training for reviews in pd.read_csv('Reviews.csv', index_col='Id', usecols=['Id', 'Summary', 'Text', 'Score'], chunksize=chunksize): #Only need training data train = reviews.iloc[reviews.index.isin(train_indices[fold])] #Continue to next iteration if there is no data in this chunk if train.shape[0] == 0: continue #Drop duplicate score-text values