import pandas as pd

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

from tea.features import *
from tea.load_data import parse_reviews
from tea.run_models import run_grid_search

if __name__ == "__main__":
    # load the labelled train reviews and separate the target column
    data = parse_reviews(load_data=False)

    X_train = data.drop(['polarity'], axis=1)
    y_train = data['polarity']

    # lemmatize the raw review text once, up front, so it is not recomputed per fold
    X_train_lemmatized = pd.DataFrame(LemmaExtractor(col_name='text').fit_transform(X_train))

    final_pipeline = Pipeline([
        ('features', FeatureUnion(transformer_list=[
            ('text_length', TextLengthExtractor(col_name='text')),
            ('avg_token_length', WordLengthMetricsExtractor(col_name='text', metric='avg', split_type='simple')),
            ('std_token_length', WordLengthMetricsExtractor(col_name='text', metric='std', split_type='simple')),
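# --- Illustration only: the pipeline definition above is cut off before the full
# feature list and estimator appear, and the signature of tea.run_models.run_grid_search
# is not shown. The sketch below is a minimal, hypothetical completion that reuses the
# feature extractors already constructed above and falls back on plain sklearn
# GridSearchCV; the actual features, classifier, and parameter grid may differ.
from sklearn.model_selection import GridSearchCV

example_pipeline = Pipeline([
    ('features', FeatureUnion(transformer_list=[
        ('text_length', TextLengthExtractor(col_name='text')),
        ('avg_token_length', WordLengthMetricsExtractor(col_name='text', metric='avg', split_type='simple')),
    ])),
    ('scaling', StandardScaler()),       # scale the numeric text-statistics features
    ('clf', LogisticRegression()),       # any of the imported classifiers could slot in here
])

param_grid = {'clf__C': [0.1, 1.0, 10.0]}   # hypothetical grid, for illustration

grid_search = GridSearchCV(example_pipeline, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_lemmatized, y_train)
print(grid_search.best_params_, grid_search.best_score_)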
import pandas as pd

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

from tea.evaluation import create_clf_report, create_benchmark_plot, plot_roc_binary
from tea.features import *
from tea.load_data import parse_reviews, get_df_stratified_split_in_train_validation

if __name__ == "__main__":
    # loading data (train and test)
    train_data = parse_reviews(load_data=False, file_type='train')
    test_data = parse_reviews(load_data=False, file_type='test')

    # stratified split of the labelled training data into train / validation sets
    split_metadata_dict = get_df_stratified_split_in_train_validation(data=train_data,
                                                                      label='polarity',
                                                                      validation_size=0.2,
                                                                      random_state=5)

    X_train = split_metadata_dict['x_train']
    X_val = split_metadata_dict['x_validation']
    X_test = test_data.drop(['polarity'], axis=1)

    y_train = split_metadata_dict['y_train']
    y_val = split_metadata_dict['y_validation']
    y_test = test_data['polarity']

    # combined train + validation set, used for the final fit before testing
    X_train_val = pd.concat([X_train, X_val], axis=0)
    y_train_val = pd.concat([y_train, y_val], axis=0)
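# --- Illustration only: a minimal sketch of how these splits might feed a baseline
# benchmark. The project's actual pipeline and the signatures of the tea.evaluation
# helpers (create_clf_report, create_benchmark_plot, plot_roc_binary) are not shown
# in the source, so sklearn's classification_report is used here as a stand-in, and
# the 'text' column name is assumed from the feature extractors above.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

baseline = Pipeline([
    ('bow', CountVectorizer()),        # raw token counts from the review text
    ('tfidf', TfidfTransformer()),     # reweight counts by inverse document frequency
    ('clf', LogisticRegression()),
])

# fit on the train split and sanity-check on the held-out validation split
baseline.fit(X_train['text'], y_train)
print(classification_report(y_val, baseline.predict(X_val['text'])))

# final fit on train + validation, then report performance on the untouched test set
baseline.fit(X_train_val['text'], y_train_val)
print(classification_report(y_test, baseline.predict(X_test['text'])))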