import pandas as pd
from keras.preprocessing.text import Tokenizer

# body, rating, val_split and max_features are assumed to be defined earlier
# in the surrounding script (see the dataset-loading section).
data = pd.DataFrame(list(zip(body, rating)), columns=['body', 'rating'])
# Remove duplicate reviews (optional):
# data = data.drop_duplicates()

############### Preprocessing ########
for i in range(0, len(data)):
    data.iloc[i, 0] = LoadDataset_General.Emoticon_detection(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.clean_raw_review(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.normalizeArabic(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.Elong_remove(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.deNoise(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.Remove_Stopwords(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.Named_Entity_Recognition(data.iloc[i, 0])
    # data.iloc[i, 0] = LoadDataset_General.Stem_word(data.iloc[i, 0])
    # data.iloc[i, 0] = LoadDataset_General.Light_Stem_word(data.iloc[i, 0])
    # data.iloc[i, 0] = LoadDataset_General.Get_root_word(data.iloc[i, 0])

# random.shuffle(data)

# Split into train and test; note that despite its name, val_split is the
# fraction of the data used for training here.
train_size = int(len(data) * val_split)
train_texts = data.iloc[0:train_size, 0].tolist()
test_texts = data.iloc[train_size:, 0].tolist()    # 'train_size:' keeps the last row; ':-1' dropped it
train_labels = data.iloc[0:train_size, 1].tolist()
test_labels = data.iloc[train_size:, 1].tolist()
num_classes = len(set(train_labels + test_labels))

# Fit the vocabulary on the training texts only.
tokenizer = Tokenizer(num_words=max_features,  # 'nb_words' was renamed 'num_words' in Keras 2
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                      split=" ")
tokenizer.fit_on_texts(train_texts)
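# --- Illustrative sketch (not part of the original script): a common next step
# with the fitted tokenizer is to map texts to index sequences and pad them to
# a fixed length. 'maxlen' here is a hypothetical value, not from the source.
from keras.preprocessing.sequence import pad_sequences

train_seqs = tokenizer.texts_to_sequences(train_texts)  # lists of word indices
test_seqs = tokenizer.texts_to_sequences(test_texts)    # words unseen in training are dropped
maxlen = 100  # hypothetical maximum review length
X_train = pad_sequences(train_seqs, maxlen=maxlen)      # shape: (len(train_texts), maxlen)
X_test = pad_sequences(test_seqs, maxlen=maxlen)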
import numpy as np
import sklearn.feature_selection

####### Load dataset ##########
LoadDataset_General = LoadDataset_General()  # rebinds the class name to an instance
datasetName = 'BBN'
(body, rating) = LoadDataset_General.Load_Data(datasetName)

############ Preprocessing ########
for i in range(0, len(body)):
    body[i] = LoadDataset_General.Emoticon_detection(body[i])
    body[i] = LoadDataset_General.clean_raw_review(body[i])
    body[i] = LoadDataset_General.normalizeArabic(body[i])
    body[i] = LoadDataset_General.Elong_remove(body[i])
    body[i] = LoadDataset_General.deNoise(body[i])
    body[i] = LoadDataset_General.Remove_Stopwords(body[i])
    body[i] = LoadDataset_General.Named_Entity_Recognition(body[i])
    # body[i] = LoadDataset_General.Stem_word(body[i])
    body[i] = LoadDataset_General.Light_Stem_word(body[i])
    # body[i] = LoadDataset_General.Get_root_word(body[i])

#### Load unbalanced dataset
(unbalanced_train_x, unbalanced_train_y,
 unbalanced_test_x, unbalanced_test_y,
 unbalanced_valid_x, unbalanced_valid_y) = \
    LoadDataset_General.get_train_test_validation_unbalanced(body, rating, datasetName)

# Fold the validation split back into training; the test split stays held out.
d_train = np.concatenate((unbalanced_train_x, unbalanced_valid_x))
Y_train = np.concatenate((unbalanced_train_y, unbalanced_valid_y)).tolist()
Y_test = unbalanced_test_y

########## Feature Extraction using Tokenization ############
###### Counts ##########
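# --- Illustrative sketch (not part of the original script): one way to realise
# the "Counts" features announced above is the Keras Tokenizer in count mode;
# 'max_features' is assumed to be defined elsewhere in the script.
from keras.preprocessing.text import Tokenizer

count_tokenizer = Tokenizer(num_words=max_features)
count_tokenizer.fit_on_texts(d_train)  # vocabulary from training texts only
X_train_counts = count_tokenizer.texts_to_matrix(d_train, mode='count')
X_test_counts = count_tokenizer.texts_to_matrix(unbalanced_test_x, mode='count')
# Each row is a vector of per-word counts over the max_features most frequent words.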