Esempio n. 1
0
def extract_data_league(params):
    """Extract the league data and build its preprocessed counterpart.

    Args:
        params: Configuration object handed to ``Database_Manager`` and to
            ``data_preprocessing``.

    Returns:
        A ``(league_csv, input_data)`` tuple: the data returned by
        ``Database_Manager.extract_data_league`` and the result of running
        ``data_preprocessing`` on it.
    """
    # Extraction step: pull the league data through the database manager.
    league_csv = Database_Manager(params).extract_data_league()

    # Preprocessing step: computed from the freshly extracted data.
    return league_csv, data_preprocessing(league_csv, params)
Esempio n. 2
0
    def test_preprocessing(self, data=None):
        """Preprocess ``data``, falling back to ``self.data`` when it is None.

        Args:
            data: Optional input to preprocess instead of ``self.data``.

        Returns:
            Whatever ``data_preprocessing`` returns for the chosen input.
        """
        source = data if data is not None else self.data

        # Same switches as the training pipeline: everything on but stemming.
        options = dict(
            punctuations=True,
            lowering=True,
            stemming=False,
            lemmatization=True,
            stop_words=True,
        )
        return data_preprocessing(source, **options)
    def test_preprocessing(self):
        """Preprocess ``self.data`` on the ``'text'`` feature.

        Only the ``links``, ``norm_whitespaces`` and ``stop_words`` switches
        are enabled; every other switch is passed explicitly as ``False``.

        Returns:
            Whatever ``data_preprocessing`` returns for ``self.data``.
        """
        switches = {
            'norm_contractions': False,
            'norm_charsequences': False,
            'twitter': False,
            'links': True,
            'norm_whitespaces': True,
            'punctuations': False,
            'lowering': False,
            'stemming': False,
            'lemmatization': False,
            'stop_words': True,
        }
        # The feature name is passed positionally, as the second argument.
        return data_preprocessing(self.data, 'text', **switches)
def preprocessing_oversampling_tdidf(data_path: Text,
                                     preprocessing_text: bool = False):
    """Load a dataset, embed its phrases and oversample the result.

    Args:
        data_path: Location passed to ``extract_dataset``.
        preprocessing_text: When true, the loaded data is first run through
            ``data_preprocessing``.

    Returns:
        The ``(x, y)`` pair produced by ``smote_oversampling``.
    """
    dataset = extract_dataset(data_path)
    if preprocessing_text:
        dataset = data_preprocessing(dataset)

    phrases = dataset['Phrase']
    labels = dataset['Sentiment']

    # The second value returned by tdidf_preprocessing is not used here.
    embedded, _ = tdidf_preprocessing(
        phrases,
        n_gram_range=(1, 3),
        max_features=100,
    )

    # The embedding is converted with .toarray() before oversampling.
    resampled_x, resampled_y = smote_oversampling(
        embedded.toarray(),
        labels,
        random_state=2021,
    )
    return resampled_x, resampled_y
Esempio n. 5
0
    def training_preprocessing(self):
        """Preprocess ``self.data`` for training and cache the cleaned result.

        Runs ``data_preprocessing`` on the ``'Phrase'`` feature, then drops
        rows whose phrase is empty or whitespace-only and rows that
        ``DataFrame.duplicated`` flags as repeats of an earlier row.

        Returns:
            The cleaned data, which is also stored in ``self.prep_data``.
        """
        prep_data = data_preprocessing(
            self.data,
            feature='Phrase',
            punctuations=True,
            lowering=True,
            stemming=False,
            lemmatization=True,
            stop_words=True,
        )

        # Remove empty phrases.
        # The previous mask, `isspace() & col == ''`, parsed as
        # `(isspace() & col) == ''` because `&` binds tighter than `==`, and
        # "whitespace-only AND empty" can never hold anyway. Stripping and
        # comparing against '' covers both the empty and the blank case.
        blank_mask = prep_data['Phrase'].str.strip() == ''
        prep_data = prep_data.drop(prep_data[blank_mask].index)

        # Remove duplicated phrases.
        # drop() needs the row labels, so pass the index of the duplicated
        # rows rather than the sub-frame itself.
        prep_data = prep_data.drop(prep_data[prep_data.duplicated()].index)

        self.prep_data = prep_data

        return prep_data
Esempio n. 6
0
    def training_preprocessing(self):
        """Preprocess ``self.data`` for training and cache the cleaned result.

        Runs ``data_preprocessing`` on the ``'text'`` feature, then drops rows
        whose text is empty or whitespace-only and rows that
        ``DataFrame.duplicated`` flags as repeats of an earlier row.

        Returns:
            The cleaned data, which is also stored in ``self.prep_data``.
        """
        prep_data = data_preprocessing(
            self.data,
            feature='text',
            norm_contractions=True,
            norm_charsequences=True,
            norm_whitespaces=True,
            norm_punctuation=True,
            punctuations=True,
            lowering=True,
            lemmatization=True,
            stop_words=True,
        )

        # Remove empty texts.
        # The previous mask, `isspace() & col == ''`, parsed as
        # `(isspace() & col) == ''` because `&` binds tighter than `==`, and
        # "whitespace-only AND empty" can never hold anyway. Stripping and
        # comparing against '' covers both the empty and the blank case.
        blank_mask = prep_data['text'].str.strip() == ''
        prep_data = prep_data.drop(prep_data[blank_mask].index)

        # Remove duplicated rows (every occurrence after the first).
        prep_data = prep_data.drop(prep_data[prep_data.duplicated()].index)

        self.prep_data = prep_data

        return prep_data
def preprocessing_oversampling_tdidf(params):
    """Load a dataset, optionally embed it and optionally rebalance it.

    Keys read from ``params``:
        ``data_path``: location passed to ``extract_dataset``.
        ``preprocessed``: when falsy, the data is run through
            ``data_preprocessing`` first.
        ``embedding``: when equal to ``TDIDF_EMBEDDING``, phrases are embedded
            using ``params['emb_params']`` (``ngram_range``, ``max_features``).
        ``imbalance``: when equal to ``SMOTE_IMBALANCE``, the data is
            oversampled using ``params['imb_params']`` (``random_state``,
            ``k_neighbors``).

    Returns:
        An ``(x_data, y_data)`` tuple. ``x_data`` is ``None`` when no
        embedding was selected and no oversampling was applied.
    """
    source_path = params.get('data_path')
    already_preprocessed = params.get('preprocessed')
    embedding_kind = params.get('embedding')
    imbalance_kind = params.get('imbalance')

    dataset = extract_dataset(source_path)
    if not already_preprocessed:
        dataset = data_preprocessing(dataset)

    phrases = dataset['Phrase']
    labels = dataset['Sentiment']

    # Data to embedding: stays None unless a known embedding is requested.
    features = None
    if embedding_kind == TDIDF_EMBEDDING:
        emb_cfg = params['emb_params']
        # The second value returned by tdidf_preprocessing is not used here.
        embedded, _ = tdidf_preprocessing(
            phrases,
            n_gram_range=emb_cfg['ngram_range'],
            max_features=emb_cfg['max_features'],
        )
        features = embedded.toarray()

    # Imbalanced data: oversample only when SMOTE is requested.
    if imbalance_kind == SMOTE_IMBALANCE:
        imb_cfg = params['imb_params']
        balanced_x, balanced_y = smote_oversampling(
            features,
            labels,
            random_state=imb_cfg['random_state'],
            k_neighbors=imb_cfg['k_neighbors'],
        )
        return balanced_x, balanced_y

    return features, labels
Esempio n. 8
0
 def test_preprocessing(self):
     """Load the Kaggle training TSV and return its preprocessed form.

     Returns:
         The result of ``data_preprocessing`` with its default options.
     """
     raw = extract_dataset('../resources/kaggle/train.tsv')
     # The original call carried a disabled `save_dir=PREPROCESSED_DATA_DIR`
     # argument; it stays disabled here.
     return data_preprocessing(raw)