def extract_data_league(params):
    db_manager = Database_Manager(params)

    # DATA EXTRACTION
    league_csv = db_manager.extract_data_league()

    # DATA PREPROCESSING
    input_data = data_preprocessing(league_csv, params)

    return league_csv, input_data

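
# Hypothetical sketch of the Database_Manager dependency used above. The real
# class is not shown in this section, so the constructor argument handling,
# the 'db_path' key, and the table name below are assumptions for
# illustration only, not the project's actual implementation.
import sqlite3

import pandas as pd


class Database_Manager:
    def __init__(self, params):
        # Assumed: params carries the path to the database file.
        self.connection = sqlite3.connect(params['db_path'])

    def extract_data_league(self):
        # Assumed: the league data lives in a table named 'league' and is
        # returned as a pandas DataFrame.
        return pd.read_sql_query('SELECT * FROM league', self.connection)
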
def test_preprocessing(self, data=None):
    d = self.data if data is None else data
    prep_data = data_preprocessing(
        d,
        punctuations=True,
        lowering=True,
        stemming=False,
        lemmatization=True,
        stop_words=True,
    )
    return prep_data

def test_preprocessing(self):
    prep_data = data_preprocessing(
        self.data,
        'text',
        norm_contractions=False,
        norm_charsequences=False,
        twitter=False,
        links=True,
        norm_whitespaces=True,
        punctuations=False,
        lowering=False,
        stemming=False,
        lemmatization=False,
        stop_words=True,
    )
    return prep_data

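
# A minimal sketch of the data_preprocessing helper the tests above rely on.
# The real implementation is not shown in this section, so the signature and
# flag handling below are assumptions; only the flags exercised most often
# are sketched, and the remaining ones (norm_contractions, twitter, links,
# norm_whitespaces, save_dir, etc.) are swallowed by **kwargs here.
# Requires the NLTK corpora once: python -m nltk.downloader stopwords wordnet
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


def data_preprocessing(data, feature='Phrase', punctuations=False,
                       lowering=False, stemming=False, lemmatization=False,
                       stop_words=False, **kwargs):
    data = data.copy()
    text = data[feature]
    if lowering:
        text = text.str.lower()
    if punctuations:
        # Strip ASCII punctuation characters.
        text = text.str.translate(str.maketrans('', '', string.punctuation))
    if stop_words:
        sw = set(stopwords.words('english'))
        text = text.apply(
            lambda t: ' '.join(w for w in t.split() if w not in sw))
    if stemming:
        stemmer = PorterStemmer()
        text = text.apply(
            lambda t: ' '.join(stemmer.stem(w) for w in t.split()))
    if lemmatization:
        lemmatizer = WordNetLemmatizer()
        text = text.apply(
            lambda t: ' '.join(lemmatizer.lemmatize(w) for w in t.split()))
    data[feature] = text
    return data
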
from typing import Text


def preprocessing_oversampling_tdidf(data_path: Text, preprocessing_text: bool = False):
    data = extract_dataset(data_path)
    if preprocessing_text:
        data = data_preprocessing(data)
    x, y = data['Phrase'], data['Sentiment']
    # Embed the phrases as TF-IDF vectors, then balance the classes with SMOTE.
    x_tdidf, tdidf = tdidf_preprocessing(x, n_gram_range=(1, 3), max_features=100)
    x_smote, y_smote = smote_oversampling(x_tdidf.toarray(), y, random_state=2021)
    return x_smote, y_smote

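
# Plausible sketches of the tdidf_preprocessing and smote_oversampling
# helpers called above, assuming they are thin wrappers over scikit-learn's
# TfidfVectorizer and imbalanced-learn's SMOTE; the actual implementations
# are not shown in this section.
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer


def tdidf_preprocessing(x, n_gram_range=(1, 1), max_features=None):
    # Fit a TF-IDF vectorizer and return the sparse matrix plus the fitted
    # vectorizer, matching how the caller unpacks two values.
    tdidf = TfidfVectorizer(ngram_range=n_gram_range, max_features=max_features)
    x_tdidf = tdidf.fit_transform(x)
    return x_tdidf, tdidf


def smote_oversampling(x, y, random_state=None, k_neighbors=5):
    # Oversample the minority classes so every class has equal support.
    smote = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
    return smote.fit_resample(x, y)
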
def training_preprocessing(self):
    prep_data = data_preprocessing(
        self.data,
        feature='Phrase',
        punctuations=True,
        lowering=True,
        stemming=False,
        lemmatization=True,
        stop_words=True,
    )
    # Remove empty phrases (whitespace-only or empty strings).
    prep_data = prep_data.drop(
        prep_data[prep_data['Phrase'].str.isspace()
                  | (prep_data['Phrase'] == '')].index)
    # Remove duplicated phrases.
    prep_data = prep_data.drop(prep_data[prep_data.duplicated()].index)
    self.prep_data = prep_data
    return prep_data

def training_preprocessing(self):
    prep_data = data_preprocessing(
        self.data,
        feature='text',
        norm_contractions=True,
        norm_charsequences=True,
        norm_whitespaces=True,
        norm_punctuation=True,
        punctuations=True,
        lowering=True,
        lemmatization=True,
        stop_words=True,
    )
    # Remove empty texts (whitespace-only or empty strings).
    prep_data = prep_data.drop(
        prep_data[prep_data['text'].str.isspace()
                  | (prep_data['text'] == '')].index)
    # Remove duplicated phrases.
    prep_data = prep_data.drop(prep_data[prep_data.duplicated()].index)
    self.prep_data = prep_data
    return prep_data

def preprocessing_oversampling_tdidf(params):
    data_path = params.get('data_path')
    preprocessed = params.get('preprocessed')
    embedding_type = params.get('embedding')
    imbalance = params.get('imbalance')

    data = extract_dataset(data_path)
    if not preprocessed:
        data = data_preprocessing(data)
    x, y = data['Phrase'], data['Sentiment']

    # Data to Embedding
    if embedding_type == TDIDF_EMBEDDING:
        x_emb, tdidf = tdidf_preprocessing(
            x,
            n_gram_range=params['emb_params']['ngram_range'],
            max_features=params['emb_params']['max_features'])
        x_emb = x_emb.toarray()
    else:
        x_emb = None

    # Imbalance Data
    if imbalance == SMOTE_IMBALANCE:
        x_smote, y_smote = smote_oversampling(
            x_emb, y,
            random_state=params['imb_params']['random_state'],
            k_neighbors=params['imb_params']['k_neighbors'])
        x_data, y_data = x_smote, y_smote
    else:
        x_data, y_data = x_emb, y

    return x_data, y_data

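
# Example of the params dict shape that preprocessing_oversampling_tdidf
# above appears to expect, inferred from how the keys are read; the concrete
# values are taken from the fixed-argument variant earlier in this section
# and are illustrative only.
example_params = {
    'data_path': '../resources/kaggle/train.tsv',
    'preprocessed': False,
    'embedding': TDIDF_EMBEDDING,
    'imbalance': SMOTE_IMBALANCE,
    'emb_params': {'ngram_range': (1, 3), 'max_features': 100},
    'imb_params': {'random_state': 2021, 'k_neighbors': 5},
}
# x_data, y_data = preprocessing_oversampling_tdidf(example_params)
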
def test_preprocessing(self):
    data = extract_dataset('../resources/kaggle/train.tsv')
    prep_data = data_preprocessing(
        data,
        # save_dir=PREPROCESSED_DATA_DIR
    )
    return prep_data
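
# Minimal sketch of the extract_dataset helper used throughout this section,
# assuming it reads the tab-separated Kaggle file into a pandas DataFrame;
# the real helper is not shown and may do more (caching, validation, etc.).
import pandas as pd


def extract_dataset(data_path):
    # The Kaggle sentiment file has a .tsv extension, hence sep='\t'.
    return pd.read_csv(data_path, sep='\t')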