import ngramGenerator import preprocessing KERNEL_FUNCTION='linear' C_PARAMETER=0.6 print "Initializing dictionnaries" stopWords = preprocessing.getStopWordList('../resources/stopWords.txt') slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt') afinn=polarity.loadAfinn('../resources/afinn.txt') #sentiWordnet=polarity.loadSentiWordnet('../resources/sentiWordnetBig.csv') emoticonDict=features.createEmoticonDictionary("../resources/emoticon.txt") print "Bulding Bag of words ..." positive=ngramGenerator.mostFreqList('../data/used/positive1.csv',3000) negative=ngramGenerator.mostFreqList('../data/used/negative1.csv',3000) neutral=ngramGenerator.mostFreqList('../data/used/neutral1.csv',3000) for w in positive: if w in negative+neutral : positive.remove(w) for w in negative: if w in positive+neutral : negative.remove(w) for w in neutral: if w in negative+positive : neutral.remove(w)
import preprocessing # User input for model parameters N_NEIGHBORS = 10 # number of neighbors for KNN KERNEL_FUNCTION = 'linear' # kernel function for SVM C_PARAMETER = 0.2 UNIGRAM_SIZE = 3000 print "Initializing dictionnaries" stopWords = preprocessing.getStopWordList('../resources/stopWords.txt') slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt') afinn = polarity.loadAfinn('../resources/afinn.txt') emoticonDict = features.createEmoticonDictionary("../resources/emoticon.txt") print "Bulding unigram vector" positive = ngramGenerator.mostFreqList('../data/used/positive1.csv', UNIGRAM_SIZE) # add as needed negative = ngramGenerator.mostFreqList('../data/used/negative1.csv', UNIGRAM_SIZE) neutral = ngramGenerator.mostFreqList('../data/used/neutral1.csv', UNIGRAM_SIZE) for w in positive: if w in negative + neutral: positive.remove(w) for w in negative: if w in positive + neutral: negative.remove(w) for w in neutral: if w in negative + positive:
import preprocessing
import ngramGenerator
import polarity
import features

# Resource tables used by preprocessing / feature extraction.
stopWords = preprocessing.getStopWordList('../resources/stopWords.txt')
slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt')
sentiWordnet=polarity.loadSentiFull('../resources/sentiWordnetBig.csv')
emoticonDict=features.createEmoticonDictionary("../resources/emoticon.txt")

# do the preprocessing here and 3 output files
# done in the threeFileGen script
# define here lists of unigram for each file , 3 lists
# Each mostFreqList item is indexed with [0] below, so entries are
# presumably (word, count) pairs -- only the word is kept.  TODO confirm.
pos=ngramGenerator.mostFreqList('../data/positive_processed.csv',2)
positive=[w[0] for w in pos]
neg=ngramGenerator.mostFreqList('../data/negative_processed.csv',2)
negative=[w[0] for w in neg]
neu=ngramGenerator.mostFreqList('../data/neutral_processed.csv',2)
neutral=[w[0] for w in neu]
total=positive+negative+neutral # total unigram vector
#print len(total)

# prepare mapping function
def mapper(filename,label):
    # k=0
    # NOTE(review): definition truncated in this view -- only the first
    # two lines of the body are visible; do not judge it from here.
    f=open(filename,'r')
    line=f.readline()
import features
import polarity
import ngramGenerator
import preprocessing

#WEUGHTS_VECTOR=[1.0,1.0,0.6]

print "Initializing dictionnaries"
# Resource tables used by preprocessing / feature extraction.
stopWords = preprocessing.getStopWordList('../resources/stopWords.txt')
slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt')
afinn = polarity.loadAfinn('../resources/afinn.txt')
#sentiWordnet=polarity.loadSentiWordnet('../resources/sentiWordnetBig.csv')
emoticonDict = features.createEmoticonDictionary("../resources/emoticon.txt")

print "Bulding unigram vector"
# Most frequent unigrams per sentiment class (3000 each).
positive = ngramGenerator.mostFreqList('../data/used/positive1.csv', 3000)
negative = ngramGenerator.mostFreqList('../data/used/negative1.csv', 3000)
neutral = ngramGenerator.mostFreqList('../data/used/neutral1.csv', 3000)
total = positive + negative + neutral # total unigram vector

# Remove duplicated words from the combined vector.
# NOTE(review): two bugs here -- the while loop removes ALL occurrences of
# a duplicated word (it runs `count` times), not just the extras, and
# `total` is mutated while being iterated, which skips elements.
for w in total:
    count = total.count(w)
    if (count > 1):
        while (count > 0):
            count = count - 1
            total.remove(w)

# equalize unigrams sizes
# NOTE(review): slicing [0:m - 1] keeps m-1 elements, one fewer than the
# minimum length -- probably meant [0:m].
m = min([len(positive), len(negative), len(neutral)])
positive = positive[0:m - 1]
negative = negative[0:m - 1]
# (source truncated here -- the matching `neutral = ...` line is not visible)
# User input for model parameters N_NEIGHBORS=10 # number of neighbors for KNN KERNEL_FUNCTION='linear' # kernel function for SVM C_PARAMETER=0.2 UNIGRAM_SIZE=3000 print "Initializing dictionnaries" stopWords = preprocessing.getStopWordList('../resources/stopWords.txt') slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt') afinn=polarity.loadAfinn('../resources/afinn.txt') emoticonDict=features.createEmoticonDictionary("../resources/emoticon.txt") print "Bulding unigram vector" positive=ngramGenerator.mostFreqList('../data/used/positive1.csv',UNIGRAM_SIZE) # add as needed negative=ngramGenerator.mostFreqList('../data/used/negative1.csv',UNIGRAM_SIZE) neutral=ngramGenerator.mostFreqList('../data/used/neutral1.csv',UNIGRAM_SIZE) for w in positive: if w in negative+neutral : positive.remove(w) for w in negative: if w in positive+neutral : negative.remove(w) for w in neutral: if w in negative+positive : neutral.remove(w)
import polarity
import ngramGenerator
import preprocessing
# NOTE(review): `features` is used below (createEmoticonDictionary) but is
# never imported in this chunk -- confirm it is imported elsewhere or add
# the import.

#WEUGHTS_VECTOR=[1.0,1.0,0.6]

print "Initializing dictionnaries"
# Resource tables used by preprocessing / feature extraction.
stopWords = preprocessing.getStopWordList('../resources/stopWords.txt')
slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt')
#sentiWordnet=polarity.loadSentiFull('../resources/sentiWordnetBig.csv')
sentiWordnet=polarity.loadSentiWordnet('../resources/sentiWordnetBig.csv')
emoticonDict=features.createEmoticonDictionary("../resources/emoticon.txt")

print "Bulding 150 unigram vector"
# Most frequent unigrams per sentiment class (1000 each despite the
# "150" in the message above).
positive=ngramGenerator.mostFreqList('../data/positive_processed.csv',1000)
negative=ngramGenerator.mostFreqList('../data/negative_processed.csv',1000)
neutral=ngramGenerator.mostFreqList('../data/neutral_processed.csv',1000)
total=positive+negative+neutral # total unigram vector

# De-duplicate the combined vector.
# NOTE(review): two bugs here -- the while loop removes ALL occurrences of
# a duplicated word (it runs `count` times), not just the extras, and
# `total` is mutated while being iterated, which skips elements.
for w in total:
    count=total.count(w)
    if (count > 1):
        while (count>0):
            count=count-1
            total.remove(w)

def mapTweet(tweet,sentiWordnet,emoDict,unigram,slangs):
    # NOTE(review): definition truncated in this view -- no body lines are
    # visible past the signature.
# NOTE(review): this chunk starts mid-function.  The four statements below
# (ending at `return`) are the tail of an enclosing `def` whose header is
# outside this view: they scale and normalize a feature vector `z` and
# return the model's prediction as a list.
z_scaled = scaler.transform([z])
z = normalizer.transform(z_scaled)
z = z[0].tolist()
return model.predict([z]).tolist()

# Preprocessing of the resource files: stop words, slang expansions, AFINN
# polarity table, and emoticon map, all resolved via abs_file_url.
stopWords = preprocessing.getStopWordList(
    abs_file_url('resources/stopWords.txt'))
slangs = preprocessing.loadSlangs(abs_file_url('resources/internetSlangs.txt'))
afinn = polarity.loadAfinn(abs_file_url('resources/afinn.txt'))
emoticonDict = features.createEmoticonDictionary(
    abs_file_url('resources/emoticon.txt'))

# Build the vector of most frequent words found in positive, negative and
# neutral tweets (up to 3000 per class).
positive = ngramGenerator.mostFreqList(abs_file_url('data/used/positive1.csv'),
                                       3000)
negative = ngramGenerator.mostFreqList(abs_file_url('data/used/negative1.csv'),
                                       3000)
neutral = ngramGenerator.mostFreqList(abs_file_url('data/used/neutral1.csv'),
                                      3000)

# Normalize the unigram list sizes, in case any class has fewer than 3000:
# truncate all three lists to the shortest length.
min_len = min([len(positive), len(negative), len(neutral)])
positive = positive[0:min_len]
negative = negative[0:min_len]
neutral = neutral[0:min_len]

# Load the training tweets
# 4 = positive
# 2 = neutral