datasets = {
    ('MOV', 2335),
    ('ATT', 568),
    ('PROD', 234),
    ('RES', 539),      # 10900 records
    ('EG_NU', 540),
    ('SemEval', 540)
}

for dataset_name, max_sent_len in datasets:
    # Reading csv data
    # ==================================================
    print("Reading text data for classification and building representations...")
    reviews = []
    # reviews = [(row["text"], row["polarity"]) for row in csv.DictReader(open(file_name, encoding="utf8"), delimiter=',', quoting=csv.QUOTE_NONE)]
    (body_all, rating_all) = LoadDataset_General.Load_Data(dataset_name)
    num_classes = len(set(rating_all))
    body = list()
    rating = list()
    # Optional filter (disabled here): keep only non-neutral (rating != 0) reviews
    # for i in range(0, len(body_all)):
    #     if rating_all[i] != 0:
    #         body.append(body_all[i])
    #         rating.append(rating_all[i])
    columns = {'body': body_all, 'rating': rating_all}
    data = pd.DataFrame(columns, columns=['body', 'rating'])
    reviews = pd.DataFrame([[body_all, rating_all]])

    ############### Preprocessing ########
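    # Hypothetical sanity check (an assumption, not part of the original
    # script): Load_Data is taken to return two parallel lists -- raw review
    # texts and integer polarity labels -- so the class balance of each
    # dataset can be inspected right after loading.
    from collections import Counter
    print(dataset_name, 'label distribution:', dict(Counter(rating_all)))
    assert len(body_all) == len(rating_all), 'texts and labels out of sync'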
datasets = {
    # ('ATT', 568),
    # ('PROD', 234),
    # ('RES', 539),    # 10900 records
    # ('RES1', 539),   # 8000 records
    # ('RES2', 539)    # 2600 records
}

for dataset_name, max_sent_len in datasets:
    # Reading csv data
    # ==================================================
    print("Reading text data for classification and building representations...")
    reviews = []
    # reviews = [(row["text"], row["polarity"]) for row in csv.DictReader(open(file_name, encoding="utf8"), delimiter=',', quoting=csv.QUOTE_NONE)]
    (body_all, rating_all) = LoadDataset_General.Load_Data(dataset_name)
    num_classes = len(set(rating_all))
    body = list()
    rating = list()
    # Keep only non-neutral (rating != 0) reviews
    for i in range(0, len(body_all)):
        if rating_all[i] != 0:
            body.append(body_all[i])
            rating.append(rating_all[i])
    columns = {'body': body, 'rating': rating}
    data = pd.DataFrame(columns, columns=['body', 'rating'])
    reviews = pd.DataFrame([[body, rating]])
    # Remove duplication
    # data = pd.DataFrame.drop_duplicates(reviews)
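    # A minimal sketch of the deduplication hinted at above (assumption: two
    # reviews are duplicates when their body text is identical). The commented
    # call targets `reviews`, a one-row frame, so it is applied to `data` here.
    data = data.drop_duplicates(subset=['body']).reset_index(drop=True)
    body = data['body'].tolist()
    rating = data['rating'].tolist()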
rating_test = list()

for dataset_name, max_sent_len in datasets:
    # Reading csv data
    # ==================================================
    print("Reading text data for classification and building representations...")
    reviews = []
    # reviews = [(row["text"], row["polarity"]) for row in csv.DictReader(open(file_name, encoding="utf8"), delimiter=',', quoting=csv.QUOTE_NONE)]
    # (body_all, rating_all) = LoadDataset_General.Load_Data(dataset_name)
    for original_dataset in Original_datasets:
        # print('original_dataset= ', original_dataset)
        if original_dataset != dataset_name:
            # Every other corpus contributes all of its reviews to training
            (body_tmp, rating_tmp) = LoadDataset_General.Load_Data(original_dataset)
            # print('not equal')
        else:
            # The target corpus is split: the first val_split fraction goes
            # to training, the remainder is held out for testing
            (body_tmp1, rating_tmp1) = LoadDataset_General.Load_Data(original_dataset)
            train_size = int(val_split * len(body_tmp1))
            body_tmp = body_tmp1[0:train_size]
            rating_tmp = rating_tmp1[0:train_size]
            body_test = body_tmp1[train_size:]
            rating_test = rating_tmp1[train_size:]
            # print('equal')
        body_all.extend(body_tmp)
        rating_all.extend(rating_tmp)
        # print(original_dataset, ':', len(body_all))
        # print(train_size)
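# The loop above implements a leave-one-dataset-out protocol: every corpus
# except the target contributes all of its reviews to training, while the
# target corpus itself is split at val_split. A hypothetical helper that
# captures the same logic (names are illustrative, not from the original):
def leave_one_out_split(target, all_datasets, load_fn, val_split):
    train_x, train_y, test_x, test_y = [], [], [], []
    for name in all_datasets:
        texts, labels = load_fn(name)
        if name != target:
            train_x.extend(texts)
            train_y.extend(labels)
        else:
            cut = int(val_split * len(texts))
            train_x.extend(texts[:cut])
            train_y.extend(labels[:cut])
            test_x, test_y = texts[cut:], labels[cut:]
    return train_x, train_y, test_x, test_y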
                         ngram_range=(1, 1), max_df=0.01)),
    # ('tfidf2', TfidfVectorizer(tokenizer=an.text_tokenize, ngram_range=(1, 2), max_df=0.01)),
    # ('tfidf3', TfidfVectorizer(tokenizer=an.text_tokenize, ngram_range=(1, 3), max_df=0.01)),
}

for dataset_name, max_sent_len in datasets:
    # Reading csv data
    # ==================================================
    print("Reading text data for classification and building representations...")
    reviews = []
    for original_dataset in Original_datasets:
        if original_dataset != dataset_name:
            (body_tmp, rating_tmp) = LoadDataset_General.Load_Data(original_dataset)
        else:
            if original_dataset not in ('EG_NU', 'SemEval'):
                (body_tmp1, rating_tmp1) = LoadDataset_General.Load_Data(original_dataset)
                train_size = int(val_split * len(body_tmp1))
                body_tmp = body_tmp1[0:train_size]
                rating_tmp = rating_tmp1[0:train_size]
                body_test = body_tmp1[train_size:]
                rating_test = rating_tmp1[train_size:]
    body_testing = list()
    rating_testing = list()
    # for i in range(0, len(body_test)):
    #     if rating_test[i] != 0:
    #         body_testing.append(body_test[i])
    #         rating_testing.append(rating_test[i])
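# The fragment above appears to enumerate word-level TF-IDF configurations
# (unigrams through trigrams). A minimal sketch of how such vectorizers are
# typically combined with sklearn's FeatureUnion; `an.text_tokenize` is the
# tokenizer named in the original fragment and max_df=0.01 mirrors its
# setting -- the union wiring itself is an assumption, not the original code:
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer

features = FeatureUnion([
    ('tfidf1', TfidfVectorizer(tokenizer=an.text_tokenize, ngram_range=(1, 1), max_df=0.01)),
    ('tfidf2', TfidfVectorizer(tokenizer=an.text_tokenize, ngram_range=(1, 2), max_df=0.01)),
])
# X_train = features.fit_transform(body_all)  # sparse [n_docs x n_features] matrix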
datasets = {
    ('PROD', 234),
    ('RES', 539),      # 10900 records
    ('EG_NU', 540),
    ('SemEval', 540)
}

body_all = list()
rating_all = list()
trainset_sizes = list()

# Training set
for dataset_name, max_sent_len in datasets:
    print("Merging all training sets")
    reviews = []
    if dataset_name not in ('EG_NU', 'SemEval'):
        # Corpora without a predefined test partition are split: the first
        # val_split fraction goes to training, the remainder to test
        (body_tmp1, rating_tmp1) = LoadDataset_General.Load_Data(dataset_name)
        train_size = int(val_split * len(body_tmp1))
        trainset_sizes.append((dataset_name, len(body_tmp1), train_size))
        body_tmp = body_tmp1[0:train_size]
        rating_tmp = rating_tmp1[0:train_size]
        body_test = body_tmp1[train_size:]
        rating_test = rating_tmp1[train_size:]
    else:
        # EG_NU and SemEval ship with dedicated test partitions
        (body_tmp, rating_tmp) = LoadDataset_General.Load_Data(dataset_name)
        (body_test, rating_test) = LoadDataset_General.Load_Data(dataset_name + '_test')
    body_all.extend(body_tmp)
    rating_all.extend(rating_tmp)

num_classes = len(set(rating_all))
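# Hypothetical reporting step (not in the original): trainset_sizes holds
# (name, total, train) triples for the split corpora, so the composition of
# the merged training set can be printed before modelling.
for name, total, train in trainset_sizes:
    print('%-8s total=%6d train=%6d test=%6d' % (name, total, train, total - train))
print('merged training reviews:', len(body_all), '| classes:', num_classes)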
from sklearn.manifold import Isomap
from sklearn.manifold import SpectralEmbedding
from sklearn.decomposition import TruncatedSVD
import nltk
from nltk.stem.isri import ISRIStemmer
from Classifiers import *
from Feature_Generation import *
import pyarabic.arabrepr
from tashaphyne.stemming import ArabicLightStemmer
from pyarabic.named import *
import sklearn.feature_selection

####### Load dataset ##########
LoadDataset_General = LoadDataset_General()
datasetName = 'BBN'
(body, rating) = LoadDataset_General.Load_Data(datasetName)

############ Preprocessing ########
for i in range(0, len(body)):
    body[i] = LoadDataset_General.Emoticon_detection(body[i])
    body[i] = LoadDataset_General.clean_raw_review(body[i])
    body[i] = LoadDataset_General.normalizeArabic(body[i])
    body[i] = LoadDataset_General.Elong_remove(body[i])
    body[i] = LoadDataset_General.deNoise(body[i])
    body[i] = LoadDataset_General.Remove_Stopwords(body[i])
    body[i] = LoadDataset_General.Named_Entity_Recognition(body[i])
    # body[i] = LoadDataset_General.Stem_word(body[i])
    body[i] = LoadDataset_General.Light_Stem_word(body[i])
    # body[i] = LoadDataset_General.Get_root_word(body[i])

#### Load unbalanced dataset
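# A minimal sketch wrapping the preprocessing chain above into a reusable
# helper, so the same normalization order can be applied to any other corpus;
# `loader` is assumed to be the LoadDataset_General instance created above.
def preprocess(text, loader):
    steps = (loader.Emoticon_detection, loader.clean_raw_review,
             loader.normalizeArabic, loader.Elong_remove, loader.deNoise,
             loader.Remove_Stopwords, loader.Named_Entity_Recognition,
             loader.Light_Stem_word)
    for step in steps:
        text = step(text)
    return text

# body = [preprocess(review, LoadDataset_General) for review in body]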
rating_tmp = list()
body_test = list()
rating_test = list()

for dataset_name, max_sent_len in datasets:
    # Reading csv data
    # ==================================================
    print("Reading text data for classification and building representations...")
    reviews = []
    # reviews = [(row["text"], row["polarity"]) for row in csv.DictReader(open(file_name, encoding="utf8"), delimiter=',', quoting=csv.QUOTE_NONE)]
    # (body_all, rating_all) = LoadDataset_General.Load_Data(dataset_name)
    for original_dataset in Original_datasets:
        # print('original_dataset= ', original_dataset)
        if original_dataset != dataset_name:
            (body_tmp, rating_tmp) = LoadDataset_General.Load_Data(original_dataset)
            # print('not equal')
        else:
            (body_tmp1, rating_tmp1) = LoadDataset_General.Load_Data(original_dataset)
            train_size = int(val_split * len(body_tmp1))
            body_tmp = body_tmp1[0:train_size]
            rating_tmp = rating_tmp1[0:train_size]
            body_test = body_tmp1[train_size:]
            rating_test = rating_tmp1[train_size:]
            # print('equal')
        body_all.extend(body_tmp)
        rating_all.extend(rating_tmp)
        # print(original_dataset, ':', len(body_all))
        # print(train_size)

    num_classes = len(set(rating_all))
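    # Hypothetical post-split check (not in the original): a chronological
    # split can leave a label out of either side, so warn when the held-out
    # slice contains classes never seen in training.
    unseen = set(rating_test) - set(rating_all)
    if unseen:
        print('Warning: test-only classes after split:', unseen)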