Example #1
def training(features, target, cycle):
    """Run one incremental (partial-fit) training step and persist the result.

    On the first cycle a fresh ``SGDRegressor`` and ``MaxAbsScaler`` are
    created; on later cycles the previously persisted pair is loaded from
    the repository, so training continues across calls (online learning).

    Args:
        features: Feature matrix for this batch (any input the scaler's
            ``partial_fit``/``transform`` accepts).
        target: Target values aligned row-for-row with ``features``.
        cycle: Training-cycle counter; ``1`` means "start from scratch",
            any other value means "resume from the persisted state".

    Returns:
        dict: ``{'message': ...}`` describing success or failure. Never
        raises — this is a top-level boundary for the training endpoint.
    """
    try:
        if cycle != 1:
            # Resume: load the model and scaler persisted by a previous cycle.
            model = repository.get(SGDRegressor.__name__, DIR_PATH)
            scaler = repository.get(MaxAbsScaler.__name__, DIR_PATH)
        else:
            # First cycle: start with a fresh estimator and scaler.
            model = SGDRegressor(loss='epsilon_insensitive', penalty='l2')
            scaler = MaxAbsScaler()

        # Update scaling statistics with this batch, then scale the batch.
        scaler.partial_fit(features)
        features = scaler.transform(features)

        # One incremental gradient step on the scaled batch.
        model.partial_fit(features, target)

        # Persist both objects so the next cycle can resume from here.
        repository.create(model, SGDRegressor.__name__, DIR_PATH)
        repository.create(scaler, MaxAbsScaler.__name__, DIR_PATH)

        return {'message': 'training successful'}

    except Exception:
        # Boundary handler: print_exc() reports the full traceback including
        # the exception type and message (print_tb alone omitted both), then
        # signal failure to the caller instead of propagating.
        traceback.print_exc()
        return {'message': 'training failed'}
            train_text_dtm = text_vectorizer.transform(train['Text'])

            #Non-negative matrix factorisation to identify topics in Text
            # TODO: implement NMF-based topic extraction

            #Remove text columns that have already been converted into numeric features
            train_features = train.drop(['Text', 'Summary'], axis='columns')

            #Convert features to sparse matrix
            train_features = csr_matrix(train_features.values)

            #Combine sparse matrices
            train = hstack([train_summary_dtm, train_text_dtm, train_features])

            #Scale
            scaler.partial_fit(train)

        #Loop through chunks for training
        for reviews in pd.read_csv('Reviews.csv',
                                   index_col='Id',
                                   usecols=['Id', 'Summary', 'Text', 'Score'],
                                   chunksize=chunksize):

            #Only need training data
            train = reviews.iloc[reviews.index.isin(train_indices[fold])]

            #Continue to next iteration if there is no data in this chunk
            if train.shape[0] == 0:
                continue

            #Drop duplicate score-text values