Example #1
import pandas as pd

# Datasets as (name, max sentence length) pairs.
datasets = {
    ('MOV', 2335),
    ('ATT', 568),
    ('PROD', 234),
    ('RES', 539),  # 10900 records
    ('EG_NU', 540),
    ('SemEval', 540)
}

for dataset_name, max_sent_len in datasets:
    # Reading csv data
    # ==================================================
    print("Reading text data for classification and building representations...")
    reviews = []
    # reviews = [(row["text"], row["polarity"]) for row in csv.DictReader(open(file_name, encoding="utf8"), delimiter=',', quoting=csv.QUOTE_NONE)]
    (body_all, rating_all) = LoadDataset_General.Load_Data(dataset_name)
    num_classes = len(set(rating_all))
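    # num_classes = number of distinct polarity labels in the loaded ratings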

    body = list()
    rating = list()
    # Optional: keep only non-neutral reviews (rating != 0), as Example #2 does.
    # for i in range(0, len(body_all)):
    #     if rating_all[i] != 0:
    #         body.append(body_all[i])
    #         rating.append(rating_all[i])
    columns = {'body': body_all, 'rating': rating_all}
    data = pd.DataFrame(columns, columns=['body', 'rating'])
    reviews = pd.DataFrame([[body_all, rating_all]])

    ############### Preprocessing ########
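
A minimal stand-in for the Load_Data call used above, modeled on the commented csv.DictReader line; the file name and the "text"/"polarity" column names are assumptions, not the project's actual loader:

import csv

def load_data_stub(file_name):
    # Hypothetical loader: read (text, polarity) rows from a CSV and return
    # parallel lists, mirroring what Load_Data appears to return above.
    body, rating = [], []
    with open(file_name, encoding="utf8") as f:
        for row in csv.DictReader(f):
            body.append(row["text"])
            rating.append(int(row["polarity"]))
    return body, rating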
Example #2
datasets = {
    #    ('ATT', 568),
    #    ('PROD', 234),
    #    ('RES', 539),   # 10900 records
    #    ('RES1', 539),  # 8000 records
    #    ('RES2', 539)   # 2600 records
}

for dataset_name, max_sent_len in datasets:
    # Reading csv data
    # ==================================================
    print(
        "Reading text data for classification and building representations...")
    reviews = []
    # reviews = [(row["text"], row["polarity"]) for row in csv.DictReader(open(file_name, encoding="utf8"), delimiter=',', quoting=csv.QUOTE_NONE)]

    (body_all, rating_all) = LoadDataset_General.Load_Data(dataset_name)
    num_classes = len(set(rating_all))

    body = list()
    rating = list()
    for i in range(0, len(body_all)):
        if rating_all[i] != 0:
            body.append(body_all[i])
            rating.append(rating_all[i])

    columns = {'body': body, 'rating': rating}
    data = pd.DataFrame(columns, columns=['body', 'rating'])
    reviews = pd.DataFrame([[body, rating]])

    # Remove duplicates
    # data = pd.DataFrame.drop_duplicates(reviews)
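
As written, the commented drop_duplicates call would fail: reviews is a single-row frame whose two cells hold whole lists, and lists are unhashable. A minimal sketch of duplicate removal on the row-per-review frame data built above:

# Drop repeated review texts, keeping the first occurrence.
data = data.drop_duplicates(subset=['body'], keep='first').reset_index(drop=True)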
Example #3

rating_test = list()

for dataset_name, max_sent_len in datasets:
    # Reading csv data
    # ==================================================
    print(
        "Reading text data for classification and building representations...")
    reviews = []
    #    reviews = [ ( row["text"] , row["polarity"]  ) for row in csv.DictReader(open(file_name, encoding="utf8"), delimiter=',', quoting=csv.QUOTE_NONE) ]

    # (body_all, rating_all) = LoadDataset_General.Load_Data(dataset_name)
    for original_dataset in Original_datasets:
        #        print('original_dataset= ',original_dataset)
        if original_dataset != dataset_name:
            (body_tmp,
             rating_tmp) = LoadDataset_General.Load_Data(original_dataset)
            # print('not equal')
        else:
            (body_tmp1,
             rating_tmp1) = LoadDataset_General.Load_Data(original_dataset)
            train_size = int(val_split * len(body_tmp1))
            body_tmp = body_tmp1[0:train_size]
            rating_tmp = rating_tmp1[0:train_size]
            body_test = body_tmp1[train_size:]
            rating_test = rating_tmp1[train_size:]
            # print('equal')
        body_all.extend(body_tmp)
        rating_all.extend(rating_tmp)
        # print(original_dataset, ':', len(body_all))
        # print(train_size)
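
The loop above realizes the split: every dataset except dataset_name contributes all of its records to training, while dataset_name itself is divided by val_split (a train fraction defined outside this snippet, e.g. 0.8) into train and held-out test slices. The same slicing as a small helper, with names of my own choosing:

def split_by_fraction(body, rating, val_split):
    # The first val_split of the records go to training; the remainder is
    # held out, mirroring the slicing inside the loop above.
    train_size = int(val_split * len(body))
    return (body[:train_size], rating[:train_size],
            body[train_size:], rating[train_size:])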
Example #4

        ('tfidf1', TfidfVectorizer(tokenizer=an.text_tokenize,
                                   ngram_range=(1, 1),
                                   max_df=0.01)),
    #        ('tfidf2', TfidfVectorizer(tokenizer=an.text_tokenize, ngram_range=(1, 2), max_df=0.01)),
    #        ('tfidf3', TfidfVectorizer(tokenizer=an.text_tokenize, ngram_range=(1, 3), max_df=0.01)),
}
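
For orientation, this is how such a vectorizer is typically fitted and applied with the scikit-learn API; an.text_tokenize comes from the snippet, while the surrounding variable names are illustrative:

from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram tf-idf features; max_df=0.01 (the snippet's value) keeps only terms
# appearing in at most 1% of the documents.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_df=0.01)
X_train = vectorizer.fit_transform(body_all)  # learn the vocabulary on training text
X_test = vectorizer.transform(body_test)      # reuse the same vocabulary on test text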

for dataset_name, max_sent_len in datasets:
    # Reading csv data
    # ==================================================
    print(
        "Reading text data for classification and building representations...")
    reviews = []
    for original_dataset in Original_datasets:
        if original_dataset != dataset_name:
            (body_tmp,
             rating_tmp) = LoadDataset_General.Load_Data(original_dataset)
        else:
            if original_dataset not in ('EG_NU', 'SemEval'):
                (body_tmp1,
                 rating_tmp1) = LoadDataset_General.Load_Data(original_dataset)
                train_size = int(val_split * len(body_tmp1))
                body_tmp = body_tmp1[0:train_size]
                rating_tmp = rating_tmp1[0:train_size]
                body_test = body_tmp1[train_size:]
                rating_test = rating_tmp1[train_size:]
                body_testing = list()
                rating_testing = list()
                # Optional: keep only non-neutral test reviews (rating != 0).
                # for i in range(0, len(body_test)):
                #     if rating_test[i] != 0:
                #         body_testing.append(body_test[i])
                #         rating_testing.append(rating_test[i])
Example #5

datasets = {
    ('PROD', 234),
    ('RES', 539),  # 10900 records
    ('EG_NU', 540),
    ('SemEval', 540)
}

body_all = list()
rating_all = list()
trainset_sizes = list()
# Training set
for dataset_name, max_sent_len in datasets:
    print("Merging all training sets")
    reviews = []

    if dataset_name not in ('EG_NU', 'SemEval'):
        (body_tmp1, rating_tmp1) = LoadDataset_General.Load_Data(dataset_name)
        train_size = int(val_split * len(body_tmp1))
        trainset_sizes.append((dataset_name, len(body_tmp1), train_size))
        body_tmp = body_tmp1[0:train_size]
        rating_tmp = rating_tmp1[0:train_size]
        body_test = body_tmp1[train_size:]
        rating_test = rating_tmp1[train_size:]
    else:
        (body_tmp, rating_tmp) = LoadDataset_General.Load_Data(dataset_name)
        (body_test,
         rating_test) = LoadDataset_General.Load_Data(dataset_name + '_test')

    body_all.extend(body_tmp)
    rating_all.extend(rating_tmp)

num_classes = len(set(rating_all))
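
trainset_sizes collects (name, total, train) triples for the datasets that were split; a quick, illustrative way to inspect that bookkeeping:

for name, total, train in trainset_sizes:
    print(f"{name}: using {train} of {total} records for training")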
Example #6
from sklearn.manifold import Isomap
from sklearn.manifold import SpectralEmbedding
from sklearn.decomposition import TruncatedSVD
import nltk
from nltk.stem.isri import ISRIStemmer
from Classifiers import *
from Feature_Generation import *
import pyarabic.arabrepr
from tashaphyne.stemming import ArabicLightStemmer
from pyarabic.named import *
import sklearn.feature_selection

####### Load dataset ##########
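# Instantiate the dataset helper; note this rebinds the class name
# LoadDataset_General to the instance itself.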
LoadDataset_General = LoadDataset_General()
datasetName = 'BBN'
(body, rating) = LoadDataset_General.Load_Data(datasetName)

############ Preprocessing ########
for i in range(0, len(body)):
    body[i] = LoadDataset_General.Emoticon_detection(body[i])
    body[i] = LoadDataset_General.clean_raw_review(body[i])
    body[i] = LoadDataset_General.normalizeArabic(body[i])
    body[i] = LoadDataset_General.Elong_remove(body[i])
    body[i] = LoadDataset_General.deNoise(body[i])
    body[i] = LoadDataset_General.Remove_Stopwords(body[i])
    body[i] = LoadDataset_General.Named_Entity_Recognition(body[i])
    #    body[i] = LoadDataset_General.Stem_word(body[i])
    body[i] = LoadDataset_General.Light_Stem_word(body[i])
#    body[i] = LoadDataset_General.Get_root_word(body[i])
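
The same cleaning chain, wrapped as a single reusable function; a sketch that assumes each helper takes and returns a string, exactly as the loop above uses them:

def preprocess(text, loader=LoadDataset_General):
    # Apply the cleaning steps from the loop above, in the same order.
    for step in (loader.Emoticon_detection, loader.clean_raw_review,
                 loader.normalizeArabic, loader.Elong_remove,
                 loader.deNoise, loader.Remove_Stopwords,
                 loader.Named_Entity_Recognition, loader.Light_Stem_word):
        text = step(text)
    return text

# usage: body = [preprocess(b) for b in body]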

#### Load unbalanced dataset
rating_tmp = list()
body_test = list()
rating_test = list()

for dataset_name, max_sent_len in datasets:
    # Reading csv data
    # ==================================================
    print("Reading text data for classification and building representations...")
    reviews = []
    # reviews = [(row["text"], row["polarity"]) for row in csv.DictReader(open(file_name, encoding="utf8"), delimiter=',', quoting=csv.QUOTE_NONE)]

    # (body_all, rating_all) = LoadDataset_General.Load_Data(dataset_name)
    for original_dataset in Original_datasets:
        # print('original_dataset = ', original_dataset)
        if original_dataset != dataset_name:
            (body_tmp, rating_tmp) = LoadDataset_General.Load_Data(original_dataset)
            # print('not equal')
        else:
            (body_tmp1, rating_tmp1) = LoadDataset_General.Load_Data(original_dataset)
            train_size = int(val_split * len(body_tmp1))
            body_tmp = body_tmp1[0:train_size]
            rating_tmp = rating_tmp1[0:train_size]
            body_test = body_tmp1[train_size:]
            rating_test = rating_tmp1[train_size:]
            # print('equal')
        body_all.extend(body_tmp)
        rating_all.extend(rating_tmp)
        # print(original_dataset, ':', len(body_all))
        # print(train_size)

    num_classes = len(set(rating_all))