Beispiel #1
0
import ngramGenerator
import preprocessing


# Model hyper-parameters; they are only defined here, their consumer is not
# visible in this fragment (presumably an SVM kernel and C value — TODO confirm).
KERNEL_FUNCTION='linear'
C_PARAMETER=0.6

# Load the lexical resources used later on: stop words, internet slang,
# the AFINN file and the emoticon dictionary.
# NOTE(review): `polarity` and `features` are not imported in the visible
# import lines of this fragment — confirm they are imported elsewhere.
print "Initializing dictionnaries"
stopWords = preprocessing.getStopWordList('../resources/stopWords.txt')
slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt')
afinn=polarity.loadAfinn('../resources/afinn.txt')
#sentiWordnet=polarity.loadSentiWordnet('../resources/sentiWordnetBig.csv')
emoticonDict=features.createEmoticonDictionary("../resources/emoticon.txt")

# Build one unigram list per sentiment class from the per-class CSV files.
# The second argument (3000) is presumably the number of most frequent
# unigrams to keep — confirm in ngramGenerator.mostFreqList.
print "Bulding Bag of words ..."
positive=ngramGenerator.mostFreqList('../data/used/positive1.csv',3000)
negative=ngramGenerator.mostFreqList('../data/used/negative1.csv',3000)
neutral=ngramGenerator.mostFreqList('../data/used/neutral1.csv',3000)


# Make the three unigram lists mutually exclusive, one list at a time.
#
# The previous loops called list.remove() on the very list they were
# iterating over; every removal shifts the remaining items left, so the
# iterator skipped the element right after each removed word and some shared
# words survived. Each list is now rebuilt from a filtered copy instead.
#
# The passes stay sequential on purpose: each pass sees the lists as left by
# the previous one. Slice assignment keeps the same list objects alive, like
# the in-place remove() calls did.
_other_words = negative + neutral
positive[:] = [w for w in positive if w not in _other_words]

_other_words = positive + neutral
negative[:] = [w for w in negative if w not in _other_words]

_other_words = negative + positive
neutral[:] = [w for w in neutral if w not in _other_words]
Beispiel #2
0
import preprocessing

# User-tunable model parameters. Only the definitions are visible in this
# fragment; where they are consumed is outside the view.
N_NEIGHBORS = 10  # number of neighbors for KNN
KERNEL_FUNCTION = 'linear'  # kernel function for SVM
C_PARAMETER = 0.2  # presumably the SVM regularisation parameter C — TODO confirm
UNIGRAM_SIZE = 3000  # passed as second argument to mostFreqList below

# Load the lexical resources: stop words, internet slang, the AFINN file and
# the emoticon dictionary.
# NOTE(review): `polarity`, `features` and `ngramGenerator` are not imported
# in the visible import lines of this fragment — confirm they are imported.
print "Initializing dictionnaries"
stopWords = preprocessing.getStopWordList('../resources/stopWords.txt')
slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt')
afinn = polarity.loadAfinn('../resources/afinn.txt')
emoticonDict = features.createEmoticonDictionary("../resources/emoticon.txt")

# Build one unigram list per sentiment class from the per-class CSV files.
print "Bulding unigram vector"
positive = ngramGenerator.mostFreqList('../data/used/positive1.csv',
                                       UNIGRAM_SIZE)  # add as needed
negative = ngramGenerator.mostFreqList('../data/used/negative1.csv',
                                       UNIGRAM_SIZE)
neutral = ngramGenerator.mostFreqList('../data/used/neutral1.csv',
                                      UNIGRAM_SIZE)

# Drop from `positive`, then from `negative`, every word that also appears in
# one of the other two lists.
#
# The previous loops called list.remove() on the list being iterated, which
# makes the iterator skip the element following each removal, so some shared
# words were never examined. Each list is now rebuilt from a filtered copy.
# The passes stay sequential (the second one sees the already filtered
# `positive`), and slice assignment keeps the same list objects alive.
_other_words = negative + neutral
positive[:] = [w for w in positive if w not in _other_words]

_other_words = positive + neutral
negative[:] = [w for w in negative if w not in _other_words]

for w in neutral:
    if w in negative + positive:
Beispiel #3
0
import preprocessing
import ngramGenerator
import polarity
import features

# Load the lexical resources: stop words, internet slang, the SentiWordNet
# file and the emoticon dictionary.
stopWords = preprocessing.getStopWordList('../resources/stopWords.txt')
slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt')
sentiWordnet=polarity.loadSentiFull('../resources/sentiWordnetBig.csv')
emoticonDict=features.createEmoticonDictionary("../resources/emoticon.txt")

# Preprocessing into the three per-class files is not done here;
# according to the original note it happens in the threeFileGen script.

# Build one unigram list per class. Each item returned by mostFreqList is
# indexed with [0] to obtain the unigram (presumably items are
# (word, frequency) pairs — confirm in ngramGenerator).
pos=ngramGenerator.mostFreqList('../data/positive_processed.csv',2)
positive=[w[0] for w in pos]
neg=ngramGenerator.mostFreqList('../data/negative_processed.csv',2)
negative=[w[0] for w in neg]
neu=ngramGenerator.mostFreqList('../data/neutral_processed.csv',2)
neutral=[w[0] for w in neu]

# Concatenation of the three lists; words shared between classes appear
# more than once in it.
total=positive+negative+neutral # total unigram vector
#print len(total)


# prepare mapping function

def mapper(filename,label):
#    k=0
    f=open(filename,'r')
    line=f.readline()
Beispiel #4
0
import features
import polarity
import ngramGenerator
import preprocessing

#WEUGHTS_VECTOR=[1.0,1.0,0.6]

# Load the lexical resources: stop words, internet slang, the AFINN file and
# the emoticon dictionary.
print "Initializing dictionnaries"
stopWords = preprocessing.getStopWordList('../resources/stopWords.txt')
slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt')
afinn = polarity.loadAfinn('../resources/afinn.txt')
#sentiWordnet=polarity.loadSentiWordnet('../resources/sentiWordnetBig.csv')
emoticonDict = features.createEmoticonDictionary("../resources/emoticon.txt")

# Build one unigram list per sentiment class from the per-class CSV files.
# The second argument (3000) is presumably the number of most frequent
# unigrams to keep — confirm in ngramGenerator.mostFreqList.
print "Bulding unigram vector"
positive = ngramGenerator.mostFreqList('../data/used/positive1.csv', 3000)
negative = ngramGenerator.mostFreqList('../data/used/negative1.csv', 3000)
neutral = ngramGenerator.mostFreqList('../data/used/neutral1.csv', 3000)

# Total unigram vector: every word that occurs in exactly one place across
# the three per-class lists. Words occurring more than once are dropped
# entirely (all occurrences), as the original removal loop intended.
#
# The previous loop called total.remove() while iterating over `total`; each
# removal shifts the remaining items, so the iterator skipped elements and
# some duplicated words were left in the vector. Filtering into a new list
# avoids mutating the sequence being traversed. `total.count` below still
# refers to the unfiltered concatenation, because the name is only rebound
# once the comprehension has finished.
total = positive + negative + neutral
total = [w for w in total if total.count(w) == 1]
# Equalize the sizes of the per-class unigram lists by truncating them to a
# common length derived from the shortest list.
# NOTE(review): the slice end is m - 1, so every list ends up one element
# shorter than the shortest list, and when m == 0 the slice [0:-1] keeps all
# but the last element of the non-empty lists (sizes then differ). [0:m]
# looks like what was meant; fix all three slices together — the `neutral`
# line is outside this fragment.
m = min([len(positive), len(negative), len(neutral)])

positive = positive[0:m - 1]
negative = negative[0:m - 1]
Beispiel #5
0
# User-tunable model parameters. Only the definitions are visible in this
# fragment; where they are consumed is outside the view.
N_NEIGHBORS=10  # number of neighbors for KNN
KERNEL_FUNCTION='linear'  # kernel function for SVM
C_PARAMETER=0.2  # presumably the SVM regularisation parameter C — TODO confirm
UNIGRAM_SIZE=3000  # passed as second argument to mostFreqList below



# Load the lexical resources: stop words, internet slang, the AFINN file and
# the emoticon dictionary.
# NOTE(review): no import lines are visible in this fragment — confirm that
# preprocessing, polarity, features and ngramGenerator are imported.
print "Initializing dictionnaries"
stopWords = preprocessing.getStopWordList('../resources/stopWords.txt')
slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt')
afinn=polarity.loadAfinn('../resources/afinn.txt')
emoticonDict=features.createEmoticonDictionary("../resources/emoticon.txt")

# Build one unigram list per sentiment class from the per-class CSV files.
print "Bulding unigram vector"
positive=ngramGenerator.mostFreqList('../data/used/positive1.csv',UNIGRAM_SIZE) # add as needed
negative=ngramGenerator.mostFreqList('../data/used/negative1.csv',UNIGRAM_SIZE)
neutral=ngramGenerator.mostFreqList('../data/used/neutral1.csv',UNIGRAM_SIZE)


# Make the three unigram lists mutually exclusive, one list at a time.
#
# The previous loops called list.remove() on the very list they were
# iterating over; every removal shifts the remaining items left, so the
# iterator skipped the element right after each removed word and some shared
# words survived. Each list is now rebuilt from a filtered copy instead.
#
# The passes stay sequential on purpose: each pass sees the lists as left by
# the previous one. Slice assignment keeps the same list objects alive, like
# the in-place remove() calls did.
_other_words = negative + neutral
positive[:] = [w for w in positive if w not in _other_words]

_other_words = positive + neutral
negative[:] = [w for w in negative if w not in _other_words]

_other_words = negative + positive
neutral[:] = [w for w in neutral if w not in _other_words]
Beispiel #6
0
import polarity
import ngramGenerator
import preprocessing


#WEUGHTS_VECTOR=[1.0,1.0,0.6]

# Load the lexical resources: stop words, internet slang, the SentiWordNet
# file and the emoticon dictionary.
# NOTE(review): `features` is not imported in the visible import lines of
# this fragment — confirm it is imported elsewhere.
print "Initializing dictionnaries"
stopWords = preprocessing.getStopWordList('../resources/stopWords.txt')
slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt')
#sentiWordnet=polarity.loadSentiFull('../resources/sentiWordnetBig.csv')
sentiWordnet=polarity.loadSentiWordnet('../resources/sentiWordnetBig.csv')
emoticonDict=features.createEmoticonDictionary("../resources/emoticon.txt")

# Build one unigram list per sentiment class from the processed CSV files.
# NOTE(review): the message below says "150" while 1000 is passed to
# mostFreqList — the printed text looks stale.
print "Bulding 150 unigram vector"
positive=ngramGenerator.mostFreqList('../data/positive_processed.csv',1000)
negative=ngramGenerator.mostFreqList('../data/negative_processed.csv',1000)
neutral=ngramGenerator.mostFreqList('../data/neutral_processed.csv',1000)


# Total unigram vector: every word that occurs in exactly one place across
# the three per-class lists. Words occurring more than once are dropped
# entirely (all occurrences), as the original removal loop intended.
#
# The previous loop called total.remove() while iterating over `total`; each
# removal shifts the remaining items, so the iterator skipped elements and
# some duplicated words were left in the vector. Filtering into a new list
# avoids mutating the sequence being traversed. `total.count` below still
# refers to the unfiltered concatenation, because the name is only rebound
# once the comprehension has finished.
total = positive + negative + neutral
total = [w for w in total if total.count(w) == 1]


 
def mapTweet(tweet,sentiWordnet,emoDict,unigram,slangs):
    # NOTE(review): this body does not match the signature — none of the
    # parameters is used, and `z` is read on the next line before it is
    # assigned anywhere in the function. Because `z` is assigned further
    # down it is a local name, so the first statement raises
    # UnboundLocalError. `scaler`, `normalizer` and `model` are not defined
    # in the visible code. The body looks like the tail of a different
    # (prediction) function; reconstruct the feature-vector computation
    # before relying on this function.
    z_scaled = scaler.transform([z])
    z = normalizer.transform(z_scaled)
    # Take the first row of the transformed result as a plain list.
    z = z[0].tolist()
    # Returns the model's prediction for the single vector, as a list.
    return model.predict([z]).tolist()


# Load the resource files: stop words, internet slang, the AFINN file and the
# emoticon dictionary. Paths are resolved through abs_file_url, which is
# defined outside this fragment.
stopWords = preprocessing.getStopWordList(
    abs_file_url('resources/stopWords.txt'))
slangs = preprocessing.loadSlangs(abs_file_url('resources/internetSlangs.txt'))
afinn = polarity.loadAfinn(abs_file_url('resources/afinn.txt'))
emoticonDict = features.createEmoticonDictionary(
    abs_file_url('resources/emoticon.txt'))

# Build the vectors with the most frequent words found in positive, negative
# and neutral tweets.
positive = ngramGenerator.mostFreqList(abs_file_url('data/used/positive1.csv'),
                                       3000)
negative = ngramGenerator.mostFreqList(abs_file_url('data/used/negative1.csv'),
                                       3000)
neutral = ngramGenerator.mostFreqList(abs_file_url('data/used/neutral1.csv'),
                                      3000)

# Bring the three unigram lists to a common length, in case any of them
# holds fewer than 3000 entries: each list is cut to the shortest length.
min_len = min(len(positive), len(negative), len(neutral))

positive, negative, neutral = (
    words[:min_len] for words in (positive, negative, neutral)
)

# Load the training tweets
# 4 = positive
# 2 = neutral