from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from tea.features import *
from tea.load_data import parse_reviews
from tea.run_models import run_grid_search

if __name__ == "__main__":
    # Load the reviews dataset (load_data=False — exact loading behaviour is
    # defined in tea.load_data.parse_reviews; not visible from here).
    data = parse_reviews(load_data=False)

    # Split into features and target: 'polarity' is the sentiment label column,
    # every other column is treated as input.
    X_train = data.drop(['polarity'], axis=1)
    y_train = data['polarity']

    # Apply the project's LemmaExtractor to the 'text' column once up-front and
    # wrap the result in a DataFrame. NOTE(review): `pd` is not imported in this
    # file's visible import block — presumably re-exported by
    # `from tea.features import *`; confirm.
    X_train_lemmatized = pd.DataFrame(
        LemmaExtractor(col_name='text').fit_transform(X_train))

    # NOTE(review): this Pipeline/FeatureUnion literal is TRUNCATED — the
    # transformer list (and the pipeline's estimator step) is cut off below,
    # and a second, unrelated script's imports follow immediately after. This
    # file looks like two scripts accidentally concatenated; the missing tail
    # must be restored from version control before this half can run.
    final_pipeline = Pipeline([
        ('features',
         FeatureUnion(transformer_list=[
             # Per-review character/token statistics over the 'text' column.
             ('text_length', TextLengthExtractor(col_name='text')),
             ('avg_token_length',
              WordLengthMetricsExtractor(
                  col_name='text', metric='avg', split_type='simple')),
             ('std_token_length',
              WordLengthMetricsExtractor(
                  col_name='text', metric='std', split_type='simple')),
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from tea.evaluation import create_clf_report, create_benchmark_plot, plot_roc_binary
from tea.features import *
from tea.load_data import parse_reviews, get_df_stratified_split_in_train_validation

if __name__ == "__main__":
    # Load the raw train and test portions of the reviews corpus.
    train_data = parse_reviews(load_data=False, file_type='train')
    test_data = parse_reviews(load_data=False, file_type='test')

    # Carve a stratified 80/20 train/validation split out of the training
    # data, keyed on the 'polarity' label; random_state pins the split.
    split_metadata_dict = get_df_stratified_split_in_train_validation(
        data=train_data, label='polarity', validation_size=0.2, random_state=5)

    # Unpack features and labels for each side of the split.
    X_train, y_train = split_metadata_dict['x_train'], split_metadata_dict['y_train']
    X_val, y_val = split_metadata_dict['x_validation'], split_metadata_dict['y_validation']

    # Held-out test set: 'polarity' is the label column, the rest are features.
    X_test = test_data.drop(['polarity'], axis=1)
    y_test = test_data['polarity']

    # Recombined train+validation frames (presumably for a final re-fit on all
    # labelled data — confirm against the downstream code).
    X_train_val = pd.concat([X_train, X_val], axis=0)
    y_train_val = pd.concat([y_train, y_val], axis=0)