def getWordLists(self):
    '''Reads terms to be tested against text from files. Returns tuple of lists of words.'''
    # Adapted from https://github.com/alexrutherford/arabic_nlp/blob/master/get_sentiment.py
    import LoadDataset_General

    posWords = pd.read_csv(self.lexicon_path_lex + 'Pos.txt')
    negWords = pd.read_csv(self.lexicon_path_lex + 'Neg.txt')
    negationWords = pd.read_csv(self.lexicon_path + 'negation_words.txt')
    posEmojis = pd.read_csv(self.lexicon_path + 'pos_emojis.txt')
    negEmojis = pd.read_csv(self.lexicon_path + 'neg_emojis.txt')

    posWords = posWords.iloc[:, 0].values.tolist()
    negWords = negWords.iloc[:, 0].values.tolist()

    # Normalize, de-elongate, and light-stem every lexicon entry so it matches
    # the preprocessed review text. One helper instance is enough here.
    loader = LoadDataset_General.LoadDataset_General()
    for i in range(len(posWords)):
        posWords[i] = loader.normalizeArabic(posWords[i])
        posWords[i] = loader.Elong_remove(posWords[i])
        posWords[i] = loader.Light_Stem_word(posWords[i])
    for i in range(len(negWords)):
        negWords[i] = loader.normalizeArabic(negWords[i])
        negWords[i] = loader.Elong_remove(negWords[i])
        negWords[i] = loader.Light_Stem_word(negWords[i])

    # Drop duplicates while preserving the original order.
    posWords = [x for i, x in enumerate(posWords) if i == posWords.index(x)]
    negWords = [x for i, x in enumerate(negWords) if i == negWords.index(x)]

    # negationWords and the emoji lists are returned as read (DataFrames).
    return posWords, negWords, negationWords, posEmojis, negEmojis
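# A minimal usage sketch (not part of the original class): a simple lexicon
# count over the lists returned by getWordLists. Whitespace tokenization,
# the one-token negation window, and treating negationWords as a plain list
# are illustrative assumptions.
def lexicon_score(text, posWords, negWords, negationWords):
    score = 0
    negate = False
    for token in text.split():
        if token in negationWords:
            # Flip the polarity of the next sentiment-bearing token.
            negate = True
            continue
        if token in posWords:
            score += -1 if negate else 1
        elif token in negWords:
            score += 1 if negate else -1
        negate = False
    return score
# A review is then labeled positive if score > 0, negative if score < 0,
# and neutral otherwise.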
Example #2
# Training parameters
# ==================================================
num_epochs = 30
print("num_epochs = ", num_epochs)

# Reading pre-trained word embeddings
# ==================================================
# load the embeddings
print("")
print("Reading pre-trained word embeddings...")
embeddings = dict()
#embeddings = Word2Vec.load_word2vec_format("C:\\Users\\paperspace\\Desktop\\(CBOW58)-ASA-3B-CBOW-window5-3iter-d300-vecotrs.bin", binary=True,encoding='utf8', unicode_errors='ignore')
#embeddings = gensim.models.Word2Vec.load('C:\\Users\\paperspace\\Desktop\\Twt-CBOW\\Twt-CBOW')
embeddings = gensim.models.Word2Vec.load('E:\\data\\aravec\\Twt-CBOW')
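# A hedged lookup helper (an assumption, not part of the original script):
# fetch a token's vector from the loaded Word2Vec model, falling back to a
# zero vector for out-of-vocabulary tokens. The 300-dimension default matches
# the models named in these snippets; a full Word2Vec model exposes vectors
# via .wv, while KeyedVectors loaded with load_word2vec_format are indexed
# directly.
import numpy as np

def word_vector(model, token, dim=300):
    if token in model.wv:
        return model.wv[token]
    return np.zeros(dim)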

LoadDataset_General = LoadDataset_General()

datasets = list()

datasets = {
    #       ('ASTD',40), #10000 records
    #       ('LABR',882),
    ('BBN', 40)
    #       ,
    #       ('SYR',40),
    #       ('HTL',1110),
    #       ('MOV',2335),
    #       ('ATT',568),
    #       ('PROD',234),
    #       ('RES',539), #10900 records
    #       ('RES1',539) #8000 records
}
# Reading pre-trained word embeddings
# ==================================================
# load the embeddings
print("")
print("Reading pre-trained word embeddings...")
embeddings = dict()
#embeddings = gensim.models.KeyedVectors.load_word2vec_format("C:\\Users\\paperspace\\Desktop\\(CBOW58)-ASA-3B-CBOW-window5-3iter-d300-vecotrs.bin", binary=True,encoding='utf8', unicode_errors='ignore')
#embeddings = gensim.models.Word2Vec.load('C:\\Users\\paperspace\\Desktop\\Twt-CBOW\\Twt-CBOW')
embeddings1 = gensim.models.KeyedVectors.load_word2vec_format(
    "E:\\data\\cbow\\(CBOW58)-ASA-3B-CBOW-window5-3iter-d300-vecotrs.bin",
    binary=True,
    encoding='utf8',
    unicode_errors='ignore')
embeddings2 = gensim.models.Word2Vec.load('E:\\data\\aravec\\Twt-CBOW')
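# Loading two models side by side suggests their vector spaces are combined
# downstream. A hedged sketch (the helper and the 300-d sizes are assumptions):
# concatenate a token's vectors from both spaces, zero-padding whichever model
# does not contain the token.
import numpy as np

def combined_vector(kv_model, w2v_model, token, dim1=300, dim2=300):
    # kv_model: KeyedVectors (load_word2vec_format); w2v_model: full Word2Vec.
    v1 = kv_model[token] if token in kv_model else np.zeros(dim1)
    v2 = w2v_model.wv[token] if token in w2v_model.wv else np.zeros(dim2)
    return np.concatenate([v1, v2])

# e.g. combined_vector(embeddings1, embeddings2, token) -> 600-d feature.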

LoadDataset_General = LoadDataset_General()

datasets = list()

datasets = {
    #       ('ASTD',40)#, #10000 records
    #		('BBN',40)#,
    #		('SYR',40)#,
    #       ('HTL',1110),
    #       ('MOV',2335)#,
    ('ATT', 568)  #,
    #       ('PROD',234)#,
    #       ('RES',539), #10900 records
    #       ('EG_NU',540)#,
    #       ('SemEval',540)
}
print("")
print("Reading pre-trained word embeddings...")
embeddings = dict()
embeddings1 = gensim.models.KeyedVectors.load_word2vec_format(
    "C:\\Users\\paperspace\\Desktop\\(CBOW58)-ASA-3B-CBOW-window5-3iter-d300-vecotrs.bin",
    binary=True,
    encoding='utf8',
    unicode_errors='ignore')
#embeddings2 = gensim.models.Word2Vec.load('C:\\Users\\paperspace\\Desktop\\Twt-CBOW\\Twt-CBOW')
#version 2
embeddings2 = gensim.models.Word2Vec.load(
    'C:\\Users\\paperspace\\Desktop\\Twt-CBOW\\v2\\tweet_cbow_300')
#embeddings1 = gensim.models.KeyedVectors.load_word2vec_format("E:\\data\\cbow\\(CBOW58)-ASA-3B-CBOW-window5-3iter-d300-vecotrs.bin", binary=True,encoding='utf8', unicode_errors='ignore')
#embeddings2 = gensim.models.Word2Vec.load('E:\\data\\aravec\\Twt-CBOW')

LoadDataset_General = LoadDataset_General()

datasets = list()

datasets = {
    #       ('ASTD',40), #10000 records
    ('BBN', 40)  #,
    #		('SYR',40),
    #       ('HTL',1110),
    #       ('MOV',2335),
    #		('ATT',568),
    #       ('PROD',234),
    #       ('RES',539), #10900 records
    #       ('EG_NU',540),
    #       ('SemEval',540)
}
# Reading pre-trained word embeddings
# ==================================================
# load the embeddings
print("")
print("Reading pre-trained word embeddings...")
embeddings = dict()
#embeddings = gensim.models.KeyedVectors.load_word2vec_format("C:\\Users\\paperspace\\Desktop\\(CBOW58)-ASA-3B-CBOW-window5-3iter-d300-vecotrs.bin", binary=True,encoding='utf8', unicode_errors='ignore')
#embeddings = gensim.models.Word2Vec.load('C:\\Users\\paperspace\\Desktop\\Twt-CBOW\\Twt-CBOW')
embeddings = gensim.models.KeyedVectors.load_word2vec_format(
    "E:\\data\\cbow\\(CBOW58)-ASA-3B-CBOW-window5-3iter-d300-vecotrs.bin",
    binary=True,
    encoding='utf8',
    unicode_errors='ignore')
#embeddings = gensim.models.Word2Vec.load('E:\\data\\aravec\\Twt-CBOW')
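# A hedged sketch of turning the loaded vectors into an embedding matrix for a
# trainable neural model. word_index is a hypothetical word -> integer-id
# mapping (e.g. from a tokenizer); it is not defined in the original script.
import numpy as np

def build_embedding_matrix(kv_model, word_index, dim=300):
    matrix = np.zeros((len(word_index) + 1, dim))
    for word, idx in word_index.items():
        if word in kv_model:
            matrix[idx] = kv_model[word]  # OOV rows stay all-zero
    return matrix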

LoadDataset_General = LoadDataset_General()

datasets = list()

datasets = {
    #       ('ASTD',40), #10000 records
    ('BBN', 40)  #,
    #		('SYR',40),
    #       ('HTL',1110),
    #       ('MOV',2335),
    #		('ATT',568),
    #       ('PROD',234),
    #       ('RES',539), #10900 records
    #       ('EG_NU',540),
    #       ('SemEval',540)
}
# sklearn.grid_search and sklearn.cross_validation were removed in newer
# scikit-learn releases; model_selection provides the same functionality.
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.manifold import Isomap
from sklearn.manifold import SpectralEmbedding
from sklearn.decomposition import TruncatedSVD
import nltk
from nltk.stem.isri import ISRIStemmer
from Classifiers import *
from Feature_Generation import *
import pyarabic.arabrepr
from tashaphyne.stemming import ArabicLightStemmer
from pyarabic.named import *
import sklearn.feature_selection
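# A hedged sketch of how these imports could be combined: grid-search a linear
# classifier over TF-IDF features. The texts/labels names and the parameter
# grid are illustrative assumptions, not taken from the original script.
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('clf', LinearSVC()),
])
param_grid = {'clf__C': [0.1, 1.0, 10.0]}
# grid = GridSearchCV(pipeline, param_grid, cv=5)
# grid.fit(texts, labels)   # texts/labels from LoadDataset_General.Load_Data
# print(grid.best_params_, grid.best_score_)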

####### Load dataset ##########
LoadDataset_General = LoadDataset_General()

datasets = list()

datasets = {
    #       ('ASTD',40), #10000 records
    ('BBN', 40)  #,
    #		('SYR',40),
    #       ('HTL',1110),
    #       ('MOV',2335),
    #		('ATT',568),
    #       ('PROD',234),
    #       ('RES',539), #10900 records
    #       ('EG_NU',540),
    #       ('SemEval',540)
}

    
# Reading pre-trained word embeddings
# ==================================================
# load the embeddings
print ("")
print ("Reading pre-trained word embeddings...")
embeddings = dict( )
#embeddings = Word2Vec.load_word2vec_format("C:\\Users\\paperspace\\Desktop\\(CBOW58)-ASA-3B-CBOW-window5-3iter-d300-vecotrs.bin", binary=True,encoding='utf8', unicode_errors='ignore')
#embeddings = gensim.models.Word2Vec.load('C:\\Users\\paperspace\\Desktop\\Twt-CBOW\\Twt-CBOW')
embeddings = gensim.models.KeyedVectors.load_word2vec_format("E:\\data\\cbow\\(CBOW58)-ASA-3B-CBOW-window5-3iter-d300-vecotrs.bin", binary=True,encoding='utf8', unicode_errors='ignore')
#embeddings = gensim.models.Word2Vec.load('E:\\data\\aravec\\Twt-CBOW')


LoadDataset_General = LoadDataset_General()

datasets = list()

datasets = {
    #       ('ASTD',40), #10000 records
    ('BBN', 40),
    #       ('SYR',40),
    #       ('HTL',1110),
    #       ('MOV',2335),
    #       ('ATT',568),
    #       ('PROD',234),
    #       ('RES',539), #10900 records
    #       ('EG_NU',540),
    #       ('SemEval',540)
}
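# A hedged sketch (the helper and whitespace tokenization are assumptions) of
# turning preprocessed reviews into fixed-length features by averaging the
# in-vocabulary token vectors from the KeyedVectors loaded above:
import numpy as np

def average_embeddings(kv_model, reviews, dim=300):
    features = np.zeros((len(reviews), dim))
    for i, review in enumerate(reviews):
        vecs = [kv_model[t] for t in review.split() if t in kv_model]
        if vecs:
            features[i] = np.mean(vecs, axis=0)  # empty reviews stay zero
    return features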
Example #8
# sklearn.grid_search and sklearn.cross_validation were removed in newer
# scikit-learn releases; model_selection provides the same functionality.
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.manifold import Isomap
from sklearn.manifold import SpectralEmbedding
from sklearn.decomposition import TruncatedSVD
import nltk
from nltk.stem.isri import ISRIStemmer
from Classifiers import *
from Feature_Generation import *
import pyarabic.arabrepr
from tashaphyne.stemming import ArabicLightStemmer
from pyarabic.named import *
import sklearn.feature_selection

####### Load dataset ##########
LoadDataset_General = LoadDataset_General()
datasetName = 'BBN'
(body, rating) = LoadDataset_General.Load_Data(datasetName)

############ Preprocessing ########
for i in range(0, len(body)):
    body[i] = LoadDataset_General.Emoticon_detection(body[i])
    body[i] = LoadDataset_General.clean_raw_review(body[i])
    body[i] = LoadDataset_General.normalizeArabic(body[i])
    body[i] = LoadDataset_General.Elong_remove(body[i])
    body[i] = LoadDataset_General.deNoise(body[i])
    body[i] = LoadDataset_General.Remove_Stopwords(body[i])
    body[i] = LoadDataset_General.Named_Entity_Recognition(body[i])
    #    body[i] = LoadDataset_General.Stem_word(body[i])
    body[i] = LoadDataset_General.Light_Stem_word(body[i])
#    body[i] = LoadDataset_General.Get_root_word(body[i])
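# The commented-out Stem_word and Get_root_word calls hint at root stemming as
# a heavier alternative to Light_Stem_word. A hedged stand-in using the
# ISRIStemmer imported above (whitespace tokenization is an assumption):
from nltk.stem.isri import ISRIStemmer

def stem_review(text):
    stemmer = ISRIStemmer()
    return ' '.join(stemmer.stem(token) for token in text.split())

# e.g. body[i] = stem_review(body[i])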
Example #9
from LoadDataset_General import *
from Lexicon_Generation import *
import codecs
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from qalsadi import analex
from Classifiers import *
from Feature_Generation import *
import pyarabic.arabrepr
from tashaphyne.stemming import ArabicLightStemmer
from pyarabic.named import *

LoadDataset_General = LoadDataset_General()
############### Preprocessing ########
# `data` is assumed to be a DataFrame of raw review text in its first column,
# loaded earlier in the source script.
for i in range(len(data)):
    data.iloc[i, 0] = LoadDataset_General.Emoticon_detection(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.clean_raw_review(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.normalizeArabic(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.Elong_remove(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.deNoise(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.Remove_Stopwords(data.iloc[i, 0])
    data.iloc[i, 0] = LoadDataset_General.Named_Entity_Recognition(data.iloc[i, 0])
#    data[i] = LoadDataset_General.Stem_word(data[i])
#    data.iloc[i,0] = LoadDataset_General.Light_Stem_word(data.iloc[i,0])
#    data[i] = LoadDataset_General.Get_root_word(data[i])

data[0][2] = LoadDataset_General.Emoticon_detection(data[0][2])