Ejemplo n.º 1
0
 def __Load(self):
     #carico le liste di dati
     pos=SaveLoad.LoadLines(self.__folder+"SentiPos.txt")
     neg=SaveLoad.LoadLines(self.__folder+"SentiNeg.txt")
             
     self.dictsentiScore={}
     
     self.sentiPos=list()
     for word in pos:
         word=word.strip()
         word=self.__PosTagger.PosTag(word)
         if word!=list():            
             word=self.__Stem.StemWord(word[0])
             self.dictsentiScore[word[0]]=float(1)
             self.sentiPos.append(word[0])
         
             
     self.sentiNeg=list()
     for word in neg:
         word=word.strip()
         word=self.__PosTagger.PosTag(word)
         if word!=list():
             word=self.__Stem.StemWord(word[0])
             self.dictsentiScore[word[0]]=float(-1)
             self.sentiNeg.append(word[0])
     
     self.__stopWords=SaveLoad.LoadByte(self.__stopWordsFilename)
     if not self.__stopWords:
         print "file stopwords mancante...\nestrazione in corso"
         paisaWordsDataExtractor.paisaWordsDataExtractor()
  
         if not self.__Load():
             print "processo di estrazione corrotto\nimpossibile procedere..."
             return False
     return True
Ejemplo n.º 2
0
    def __ElaboraFiles(self):
        #elabora tutti i file

        #le frasi con solo i tag
        self.tagsent = list()
        #le frasi con solo le words
        self.sentsent = list()

        i = float(1)
        #files=glob.glob(self.folderDialoghiPosTag+u'*')
        files = ["C:\\Esame TTR\\tutti i dialoghi.txt"]
        tot = len(files)

        print tot
        for fil in files:
            print "elaborazione file ", fil[len(self.folderDialoghiPosTag
                                                ):], " - ", i, " / ", tot

            #to del self.
            #self.dlg=SaveLoad.LoadByte(fil)
            self.dlg = SaveLoad.LoadLines(fil)
            if self.dlg:
                for j in xrange(len(self.dlg) -
                                1):  #-1 perchè non devo fare l'ultima!
                    k = self.getSent(self.dlg[j])
                    v = self.getSent(self.dlg[j + 1])
                    k = u" ".join(k)
                    k = k.strip()

                    v = u" ".join(v)
                    v = v.strip()

                    self.sentsentSuccessiva[k].append(v)
            i += 1
Ejemplo n.º 3
0
    def allPosTag(self):
        i = float(1)
        tot = len(glob.glob(self.folderDialoghiClean + '*'))
        ifile = len(glob.glob(self.folderDialoghiPosTag + '*'))

        for fil in glob.glob(self.folderDialoghiClean + u'*'):
            try:
                dlg = []
                for line in SaveLoad.LoadLines(fil):
                    if len(line.strip()) > 0:
                        line = unicode(line)
                        line = line.lower()
                        l = self.pos.PosTag(line)
                        dlg.append(l)

                #salvo il file postaggato e senza \n inutili
                filename = str(ifile) + u'.txt'
                filename = self.folderDialoghiPosTag + filename
                #salvo il file  dei dialoghi
                if SaveLoad.SaveByte(dlg, filename):
                    ifile += 1

            except:
                #potrebbero esserci errori di qualsiasi natura nei file, in questo caso ignoro il file e passo al successivo
                pass

            print "elaborazione file ", fil[len(self.folderDialoghiClean
                                                ):], " - ", i, " / ", tot
            i += 1
Ejemplo n.º 4
0
    def AvviaCreazioneDati(self):
        sentWithSentimento = list()

        i = 0
        files = glob.glob(self.__folderdialoghi + '*')
        for file in files:

            print file

            dlg = SaveLoad.LoadLines(file)
            for line in dlg:
                line = self.SentiAnaliz.StimaSentimentoFrase(line)
                if line != list():
                    for l in line:
                        if l != list():
                            sentWithSentimento.append(l)


#            print 'file', i
            i += 1

            if i > 50:
                return sentWithSentimento

        return sentWithSentimento
Ejemplo n.º 5
0
 def SentimentiMostFreq(self):        
     lines=SaveLoad.LoadLines(self.sentimentiMostFreqFilename)
     for line in lines:
         try:
             print line
             keyRicerca=line.strip()
             _=YahooAnswer.YahooAnswer(answer=keyRicerca, ordinamento='rillevanza', numeroRisultati=3)
         except:
             pass
Ejemplo n.º 6
0
    def __ElaboraFile(self, file):
        """For every pair of consecutive lines in *file*, map the first
        sentence to the sentence that follows it in
        ``self.sentsentSuccessiva``."""
        dlg = SaveLoad.LoadLines(file)
        if not dlg:
            return
        # pair each line with its successor; the last line has no successor
        for cur, nxt in zip(dlg[:-1], dlg[1:]):
            key = u"".join(self.getSent(cur)).strip()
            val = u"".join(self.getSent(nxt)).strip()
            self.sentsentSuccessiva[key].append(val)
Ejemplo n.º 7
0
    def __init__(self, dlgname):
        self.extdlg = '.txt'
        self.extdlgData = ".dat"
        self.extdlgVoc = ".voc"
        self.extdictWordScoreRow = '.dictWordsScoreRows'

        self.folder = "risorse\\Dati\\dialoghi\\"

        dlg = SaveLoad.LoadLines(self.folder + "dialoghiRaw\\" + dlgname +
                                 self.extdlg)

        if dlg:
            self.__Tfidf(dlg)
            self.__Save(dlgname)
        else:
            print "file di dialogo mancante\nimpossibile procedere..."
Ejemplo n.º 8
0
    def __CaricaDlg(self, dlgname):
        self.dlg = SaveLoad.LoadLines(self.folder + "\\dialoghiRaw\\" +
                                      dlgname + self.extdlg)
        self.dictWordScoreRow = SaveLoad.LoadByte(self.folder + dlgname +
                                                  self.extdictWordScoreRow)
        self.dictWordIndex = SaveLoad.LoadByte(self.folder + dlgname +
                                               self.extdictWordIndex)

        if not (self.dictWordScoreRow and self.dictWordIndex):
            print "file dict mancanti\ncreazione in corso..."
            CreaDatiQuestEngine.CreaDatiQuestEngine(dlgname)
            #ricarico i dati
            self.dictWordScoreRoww = SaveLoad.LoadByte(
                self.folder + dlgname + self.extdictWordScoreRow)
            self.dictWordIndex = SaveLoad.LoadByte(self.folder + dlgname +
                                                   self.extdictWordIndex)
Ejemplo n.º 9
0
    def __Train(self):
        """Train a punkt sentence tokenizer on the tagged corpus.

        When the abbreviation list loads successfully it is installed as
        the tokenizer's ``abbrev_types``; otherwise (or on any error) the
        default tokenizer is loaded instead.

        Returns True when the custom tokenizer was trained; returns None
        when the default tokenizer was used (callers treat both falsy
        values alike).
        """
        try:
            abl = SaveLoad.LoadLines(self.__abbrvFilename)
            train = self.__corpus.tagged_sents()[:]

            # LoadLines returns False on failure; an *empty* list is still
            # accepted here, so this is deliberately not a truthiness test
            if abl != False:

                punkt_param = nltk.tokenize.punkt.PunktParameters()
                punkt_param.abbrev_types = abl

                self.__sentsTokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer(
                    punkt_param)

                self.__sentsTokenizer.train(train)

                return True
            else:
                self.__DefaultSentTokenize()

        except Exception:
            # narrowed from a bare ``except:``: fall back to the default
            # tokenizer on any real error, but let KeyboardInterrupt and
            # SystemExit propagate
            self.__DefaultSentTokenize()
Ejemplo n.º 10
0
    def __CreaCorpus(self, dlgfilename):
        """Read the sentences of *dlgfilename* and return the
        ``(corpus, vocab)`` pair produced by ``__corpus2vectors``."""
        sentences = SaveLoad.LoadLines(dlgfilename)
        return self.__corpus2vectors(sentences)
Ejemplo n.º 11
0
# Module-level resources for the sentiment analyzer.
__folder = "risorse\\Dati\\"
__stopWordsFilename = __folder + "StopWords.list"
__stopWords = list()

# sentence cleaner
__pulisci = PulisciSent.PulisciSent()

# NLP pipeline components (tokenizer, POS tagger, parser, Italian stemmer)
__sentTokenize = SentenceTokenize.SentsTokenize()
__PosTagger = PosTagger_light.PosTagger()
__Parse = ParseIt.ParseIt()
__Stem = StemmerIt.StemmerIt()

# POS tags ignored during analysis; words that flip a sentence's polarity
tagtoexclude = [u"PUNC", u"SYM", u"NUM", u"ART", u"PRE"]
WordsInvertiSent = [u"no", u"non"]

# load the sentiment word lists
# NOTE(review): SaveLoad.LoadLines appears to return False when the file is
# missing (see its other call sites) — iterating that here would raise;
# confirm SentiPos.txt / SentiNeg.txt always exist.
pos = SaveLoad.LoadLines(__folder + "SentiPos.txt")
neg = SaveLoad.LoadLines(__folder + "SentiNeg.txt")

# word -> +1.0 for positive, -1.0 for negative (negatives win on overlap)
dictsentiScore = {word.strip(): float(1) for word in pos}
dictsentiScore.update({word.strip(): float(-1) for word in neg})

sentiPos = [word.strip() for word in pos]
sentiNeg = [word.strip() for word in neg]

def Analisi(sents):
    """
        
        Analisi frase
    
        input: string
Ejemplo n.º 12
0
    def __init__(self, dlgname):
        """Build and persist the word-score-row / word-index dictionaries
        for dialogue *dlgname* from its TF-IDF matrix.

        Loads the raw dialogue plus the ``.dat`` (TF-IDF matrix) and
        ``.voc`` (vocabulary) artifacts, regenerating them via
        ``tfidfDlgCreator`` when the ``.dat`` file is unusable.  For every
        non-zero matrix cell it records, per word, a ``{score: row}``
        mapping (score = term count in the row * TF-IDF prob), then saves
        both dictionaries next to the dialogue.
        """
        # extensions of the persisted artifacts
        self.extdlg = ".txt"
        self.extdlgData = ".dat"
        self.extdlgVoc = ".voc"
        self.extdictWordIndex = '.dictWordIndex'
        self.extdictWordScoreRow = '.dictWordsScoreRows'

        self.folder = "risorse\\Dati\\dialoghi\\"

        dlg = SaveLoad.LoadLines(self.folder + "\\dialoghiRaw\\" + dlgname +
                                 self.extdlg)
        #dlgfilename="C:\\Esame TTR\\risorse\\Dati\\dialoghi\\dialoghiRaw\\Black Mirror  dvdrip.txt"

        dlgdata = SaveLoad.LoadByte(self.folder + dlgname + self.extdlgData)
        vocab = SaveLoad.LoadByte(self.folder + dlgname + self.extdlgVoc)

        # probe the loaded matrix: .mean() raises when the .dat file was
        # missing/corrupt (LoadByte then returned a non-array)
        try:
            dlgdata.mean()
        except:
            print "file .dat mancante\ncreazione in corso..."
            # create the file with tf-idf, cosine, etc.
            tfidfDlgCreator.tfidfDataCreator(dlgname)

            # reload the regenerated data
            dlgdata = SaveLoad.LoadByte(self.folder + dlgname +
                                        self.extdlgData)
            vocab = SaveLoad.LoadByte(self.folder + dlgname + self.extdlgVoc)

        # index -> word (vocab is assumed indexable by matrix column)
        dictIndexWord = dict()
        for i in xrange(len(vocab)):
            dictIndexWord[i] = vocab[i]

        # word -> column index (inverse of dictIndexWord)
        self.dictWordIndex = dict()
        for k, v in dictIndexWord.iteritems():
            self.dictWordIndex[v] = k

        self.dictWordScoreRow = collections.defaultdict(list)
        # [word] -> list of {score in row: row}, used later to take the max

        #        print dlgdata.shape[0]
        #        print dlgdata.shape[1]

        # walk every cell of the (rows = dialogue lines, cols = vocab
        # words) matrix; only non-zero probabilities are recorded
        # NOTE(review): assumes dlgdata supports tuple indexing and that
        # row i of the matrix corresponds to line i of ``dlg`` — confirm
        for row in xrange(dlgdata.shape[0]):
            for col in xrange(dlgdata.shape[1]):
                indice = tuple([row, col])
                prob = dlgdata[indice]
                if prob != float(0):
                    #                    print row, col, prob

                    scoreRow = dict()

                    # score = occurrences of the word in this line, weighted
                    # by its TF-IDF value
                    word = dictIndexWord[col]
                    count = dlg[row].split()
                    count = count.count(word)
                    count = count * prob  #dlgdata[row][col]

                    scoreRow[count] = row
                    self.dictWordScoreRow[word].append(scoreRow)

        # free the large matrix before pickling the dictionaries
        del dlgdata

        print "file Saved:", SaveLoad.SaveByte(
            self.dictWordScoreRow,
            self.folder + dlgname + self.extdictWordScoreRow)
        print "file Saved:", SaveLoad.SaveByte(
            self.dictWordIndex, self.folder + dlgname + self.extdictWordIndex)
Ejemplo n.º 13
0
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 10 05:38:55 2015

@author: Patrizio
"""

import SaveLoad

import re

print 'sistema tutti i dialoghi k'
#funzione per non correggere la classe e rifare tutti i calcoli

dlg = SaveLoad.LoadLines('tutti i dialoghi k.txt')

i = 0
for line in dlg:

    line = re.sub('(\s){2,}', u"#", line)
    line = re.sub('(\s){,}', u"", line)
    line = re.sub('(#)', u" ", line)

    print line

    i += 1

    if line > 3:
        break
Ejemplo n.º 14
0
# -*- coding: utf-8 -*-
"""One-off experiment: load a raw dialogue file and build its TF-IDF matrix
with scikit-learn, printing a timestamp around each step.

(Originally a temporary Spyder script file.)
"""
import SaveLoad

import AnalizzatoreSents
import time

from sklearn.feature_extraction.text import TfidfVectorizer

filename = "C:\\Esame TTR\\risorse\\Dati\\dialoghi\\dialoghiRaw\\Burn_Notice.txt"

dlg = SaveLoad.LoadLines(filename)

#analiz=AnalizzatoreSents.AnalizzatoreSents()

print time.asctime()
print 'creo copia in analiz'
# plain alias, not a copy: the sentence-analysis step is disabled here
dlgcopy = dlg  #analiz.Analisi(dlg)
print time.asctime()

# fit TF-IDF over the dialogue lines and extract the vocabulary
tfidf = TfidfVectorizer()
print time.asctime()
print 'creo tfidf'
tfidfdata = tfidf.fit_transform(dlgcopy)
vocab = tfidf.get_feature_names()
print type(vocab)