# Adapted from https://github.com/alexrutherford/arabic_nlp/blob/master/get_sentiment.py
import pandas as pd

import LoadDataset_General


def getWordLists(self):
    '''Reads terms to be tested against text from files.
    Returns tuple of lists of words.'''
    posWords = pd.read_csv(self.lexicon_path_lex + 'Pos.txt')
    negWords = pd.read_csv(self.lexicon_path_lex + 'Neg.txt')
    negationWords = pd.read_csv(self.lexicon_path + 'negation_words.txt')
    posEmojis = pd.read_csv(self.lexicon_path + 'pos_emojis.txt')
    negEmojis = pd.read_csv(self.lexicon_path + 'neg_emojis.txt')

    posWords = posWords.iloc[:, 0].values.tolist()
    negWords = negWords.iloc[:, 0].values.tolist()

    # Normalise the lexicon terms with the same pipeline applied to the text:
    # orthographic normalisation, elongation removal, then light stemming.
    # A single shared instance is enough here.
    loader = LoadDataset_General.LoadDataset_General()
    for i in range(len(posWords)):
        posWords[i] = loader.normalizeArabic(posWords[i])
        posWords[i] = loader.Elong_remove(posWords[i])
        posWords[i] = loader.Light_Stem_word(posWords[i])
    for i in range(len(negWords)):
        negWords[i] = loader.normalizeArabic(negWords[i])
        negWords[i] = loader.Elong_remove(negWords[i])
        negWords[i] = loader.Light_Stem_word(negWords[i])

    # Deduplicate while preserving order; stemming can collapse several
    # surface forms onto the same entry.
    posWords = list(dict.fromkeys(posWords))
    negWords = list(dict.fromkeys(negWords))

    return posWords, negWords, negationWords, posEmojis, negEmojis
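# A minimal usage sketch (not part of the original file) showing how the lists
# returned by getWordLists can drive a simple lexicon score. It assumes the
# word lists have been flattened to Python lists/sets of normalised tokens;
# the flip-on-negation rule is an illustrative assumption.
def lexicon_score(tokens, posWords, negWords, negationWords):
    """Sum +1/-1 lexicon hits, flipping polarity right after a negation word."""
    score = 0
    negated = False
    for tok in tokens:
        if tok in negationWords:
            negated = True                # negate the polarity of the next hit
            continue
        hit = (tok in posWords) - (tok in negWords)  # +1, 0, or -1
        score += -hit if negated else hit
        negated = False
    return score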
# Training parameters
# ==================================================
num_epochs = 30
print("num_epochs = ", num_epochs)

# Reading pre-trained word embeddings
# ==================================================
print("")
print("Reading pre-trained word embeddings...")
embeddings = gensim.models.Word2Vec.load('E:\\data\\aravec\\Twt-CBOW')

# Note: the class name is rebound to an instance here, a pattern kept
# throughout these scripts; later calls go through this instance.
LoadDataset_General = LoadDataset_General()

# One (dataset, parameter) tuple is left active; the rest are alternatives.
datasets = {
    # ('ASTD', 40),   # 10000 records
    # ('LABR', 882),
    ('BBN', 40)
    # ('SYR', 40),
    # ('HTL', 1110),
    # ('MOV', 2335),
    # ('ATT', 568),
    # ('PROD', 234),
    # ('RES', 539),   # 10900 records
    # ('RES1', 539),  # 8000 records
}
# Reading pre-trained word embeddings
# ==================================================
print("")
print("Reading pre-trained word embeddings...")
embeddings1 = gensim.models.KeyedVectors.load_word2vec_format(
    "E:\\data\\cbow\\(CBOW58)-ASA-3B-CBOW-window5-3iter-d300-vecotrs.bin",
    binary=True, encoding='utf8', unicode_errors='ignore')
embeddings2 = gensim.models.Word2Vec.load('E:\\data\\aravec\\Twt-CBOW')

LoadDataset_General = LoadDataset_General()
datasets = {
    # ('ASTD', 40),  # 10000 records
    # ('BBN', 40),
    # ('SYR', 40),
    # ('HTL', 1110),
    # ('MOV', 2335),
    ('ATT', 568)
    # ('PROD', 234),
    # ('RES', 539),  # 10900 records
    # ('EG_NU', 540),
    # ('SemEval', 540)
}
print("") print("Reading pre-trained word embeddings...") embeddings = dict() embeddings1 = gensim.models.KeyedVectors.load_word2vec_format( "C:\\Users\\paperspace\\Desktop\\(CBOW58)-ASA-3B-CBOW-window5-3iter-d300-vecotrs.bin", binary=True, encoding='utf8', unicode_errors='ignore') #embeddings2 = gensim.models.Word2Vec.load('C:\\Users\\paperspace\\Desktop\\Twt-CBOW\\Twt-CBOW') #version 2 embeddings2 = gensim.models.Word2Vec.load( 'C:\\Users\\paperspace\\Desktop\\Twt-CBOW\\v2\\tweet_cbow_300') #embeddings1 = gensim.models.KeyedVectors.load_word2vec_format("E:\\data\\cbow\\(CBOW58)-ASA-3B-CBOW-window5-3iter-d300-vecotrs.bin", binary=True,encoding='utf8', unicode_errors='ignore') #embeddings2 = gensim.models.Word2Vec.load('E:\\data\\aravec\\Twt-CBOW') LoadDataset_General = LoadDataset_General() datasets = list() datasets = { # ('ASTD',40), #10000 records ('BBN', 40) #, # ('SYR',40), # ('HTL',1110), # ('MOV',2335), # ('ATT',568), # ('PROD',234), # ('RES',539), #10900 records # ('EG_NU',540), # ('SemEval',540) }
# Reading pre-trained word embeddings
# ==================================================
print("")
print("Reading pre-trained word embeddings...")
embeddings = gensim.models.KeyedVectors.load_word2vec_format(
    "E:\\data\\cbow\\(CBOW58)-ASA-3B-CBOW-window5-3iter-d300-vecotrs.bin",
    binary=True, encoding='utf8', unicode_errors='ignore')

LoadDataset_General = LoadDataset_General()
datasets = {
    # ('ASTD', 40),  # 10000 records
    ('BBN', 40)
    # ('SYR', 40),
    # ('HTL', 1110),
    # ('MOV', 2335),
    # ('ATT', 568),
    # ('PROD', 234),
    # ('RES', 539),  # 10900 records
    # ('EG_NU', 540),
    # ('SemEval', 540)
}
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed in scikit-learn 0.20
from sklearn import model_selection               # replaces the removed sklearn.cross_validation module
from sklearn.manifold import Isomap
from sklearn.manifold import SpectralEmbedding
from sklearn.decomposition import TruncatedSVD
import nltk
from nltk.stem.isri import ISRIStemmer
from Classifiers import *
from Feature_Generation import *
import pyarabic.arabrepr
from tashaphyne.stemming import ArabicLightStemmer
from pyarabic.named import *
import sklearn.feature_selection

####### Load dataset ##########
LoadDataset_General = LoadDataset_General()
datasets = {
    # ('ASTD', 40),  # 10000 records
    ('BBN', 40)
    # ('SYR', 40),
    # ('HTL', 1110),
    # ('MOV', 2335),
    # ('ATT', 568),
    # ('PROD', 234),
    # ('RES', 539),  # 10900 records
    # ('EG_NU', 540),
    # ('SemEval', 540)
}
# Reading pre-trained word embeddings
# ==================================================
print("")
print("Reading pre-trained word embeddings...")
embeddings = gensim.models.KeyedVectors.load_word2vec_format(
    "E:\\data\\cbow\\(CBOW58)-ASA-3B-CBOW-window5-3iter-d300-vecotrs.bin",
    binary=True, encoding='utf8', unicode_errors='ignore')
# Alternative: embeddings = gensim.models.Word2Vec.load('E:\\data\\aravec\\Twt-CBOW')

LoadDataset_General = LoadDataset_General()
datasets = {
    # ('ASTD', 40),  # 10000 records
    ('BBN', 40)
    # ('SYR', 40),
    # ('HTL', 1110),
    # ('MOV', 2335),
    # ('ATT', 568),
    # ('PROD', 234),
    # ('RES', 539),  # 10900 records
    # ('EG_NU', 540),
    # ('SemEval', 540)
}
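# A minimal sketch (an addition, not from the original script) of the usual
# next step: turn a tokenised review into a fixed-length feature vector by
# averaging the word vectors found in `embeddings`.
import numpy as np

def average_embedding(tokens, dim=300):
    vecs = [embeddings[t] for t in tokens if t in embeddings]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)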
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection
from sklearn.manifold import Isomap
from sklearn.manifold import SpectralEmbedding
from sklearn.decomposition import TruncatedSVD
import nltk
from nltk.stem.isri import ISRIStemmer
from Classifiers import *
from Feature_Generation import *
import pyarabic.arabrepr
from tashaphyne.stemming import ArabicLightStemmer
from pyarabic.named import *
import sklearn.feature_selection

####### Load dataset ##########
LoadDataset_General = LoadDataset_General()
datasetName = 'BBN'
(body, rating) = LoadDataset_General.Load_Data(datasetName)

############ Preprocessing ########
for i in range(len(body)):
    body[i] = LoadDataset_General.Emoticon_detection(body[i])
    body[i] = LoadDataset_General.clean_raw_review(body[i])
    body[i] = LoadDataset_General.normalizeArabic(body[i])
    body[i] = LoadDataset_General.Elong_remove(body[i])
    body[i] = LoadDataset_General.deNoise(body[i])
    body[i] = LoadDataset_General.Remove_Stopwords(body[i])
    body[i] = LoadDataset_General.Named_Entity_Recognition(body[i])
    # body[i] = LoadDataset_General.Stem_word(body[i])       # heavier stemming, disabled
    body[i] = LoadDataset_General.Light_Stem_word(body[i])
    # body[i] = LoadDataset_General.Get_root_word(body[i])   # root extraction, disabled
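# A hedged baseline sketch (an illustration, not the project's Classifiers /
# Feature_Generation pipeline): TF-IDF features over the cleaned `body` with a
# grid-searched linear SVM. LinearSVC and the parameter grid are assumptions.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import LinearSVC

X_train, X_test, y_train, y_test = train_test_split(
    body, rating, test_size=0.2, random_state=42)
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
grid = GridSearchCV(LinearSVC(), {'C': [0.1, 1, 10]}, cv=5)
grid.fit(tfidf.fit_transform(X_train), y_train)
print("best C:", grid.best_params_,
      "test accuracy:", grid.score(tfidf.transform(X_test), y_test))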
from LoadDataset_General import *
from Lexicon_Generation import *
import codecs
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from qalsadi import analex
from Classifiers import *
from Feature_Generation import *
import pyarabic.arabrepr
from tashaphyne.stemming import ArabicLightStemmer
from pyarabic.named import *

LoadDataset_General = LoadDataset_General()

############### Preprocessing ########
# The text lives in the first column of the DataFrame `data`.
for i in range(len(data)):
    data.iloc[i, 0] = LoadDataset_General.Emoticon_detection(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.clean_raw_review(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.normalizeArabic(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.Elong_remove(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.deNoise(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.Remove_Stopwords(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.Named_Entity_Recognition(data.iloc[i, 0])
    # data.iloc[i, 0] = LoadDataset_General.Stem_word(data.iloc[i, 0])
    # data.iloc[i, 0] = LoadDataset_General.Light_Stem_word(data.iloc[i, 0])
    # data.iloc[i, 0] = LoadDataset_General.Get_root_word(data.iloc[i, 0])
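# The same pipeline written with pandas `apply` (a sketch of an equivalent,
# more idiomatic form; it assumes the steps above and the instance bound to
# the name LoadDataset_General).
def preprocess(text, loader=LoadDataset_General):
    for step in (loader.Emoticon_detection, loader.clean_raw_review,
                 loader.normalizeArabic, loader.Elong_remove, loader.deNoise,
                 loader.Remove_Stopwords, loader.Named_Entity_Recognition):
        text = step(text)
    return text

data.iloc[:, 0] = data.iloc[:, 0].apply(preprocess)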