def __Load(self): #carico le liste di dati pos=SaveLoad.LoadLines(self.__folder+"SentiPos.txt") neg=SaveLoad.LoadLines(self.__folder+"SentiNeg.txt") self.dictsentiScore={} self.sentiPos=list() for word in pos: word=word.strip() word=self.__PosTagger.PosTag(word) if word!=list(): word=self.__Stem.StemWord(word[0]) self.dictsentiScore[word[0]]=float(1) self.sentiPos.append(word[0]) self.sentiNeg=list() for word in neg: word=word.strip() word=self.__PosTagger.PosTag(word) if word!=list(): word=self.__Stem.StemWord(word[0]) self.dictsentiScore[word[0]]=float(-1) self.sentiNeg.append(word[0]) self.__stopWords=SaveLoad.LoadByte(self.__stopWordsFilename) if not self.__stopWords: print "file stopwords mancante...\nestrazione in corso" paisaWordsDataExtractor.paisaWordsDataExtractor() if not self.__Load(): print "processo di estrazione corrotto\nimpossibile procedere..." return False return True
def __ElaboraFiles(self):
    """Process every dialog file: for each pair of consecutive lines,
    append the sentence of line j+1 to self.sentsentSuccessiva keyed by
    the sentence of line j.
    """
    # process all files
    # sentences with only the tags
    self.tagsent = list()
    # sentences with only the words
    self.sentsent = list()
    i = float(1)  # progress counter (1-based)
    #files=glob.glob(self.folderDialoghiPosTag+u'*')
    # NOTE(review): file list is hard-coded, overriding the glob above --
    # presumably a debugging shortcut; confirm before shipping.
    files = ["C:\\Esame TTR\\tutti i dialoghi.txt"]
    tot = len(files)
    print tot
    for fil in files:
        print "elaborazione file ", fil[len(self.folderDialoghiPosTag ):], " - ", i, " / ", tot
        #to del self.
        #self.dlg=SaveLoad.LoadByte(fil)
        self.dlg = SaveLoad.LoadLines(fil)
        if self.dlg:
            for j in xrange(len(self.dlg) - 1):
                #-1 because the last line has no successor!
                k = self.getSent(self.dlg[j])
                v = self.getSent(self.dlg[j + 1])
                k = u" ".join(k)
                k = k.strip()
                v = u" ".join(v)
                v = v.strip()
                # map each sentence to the sentence that follows it
                self.sentsentSuccessiva[k].append(v)
        i += 1
def allPosTag(self): i = float(1) tot = len(glob.glob(self.folderDialoghiClean + '*')) ifile = len(glob.glob(self.folderDialoghiPosTag + '*')) for fil in glob.glob(self.folderDialoghiClean + u'*'): try: dlg = [] for line in SaveLoad.LoadLines(fil): if len(line.strip()) > 0: line = unicode(line) line = line.lower() l = self.pos.PosTag(line) dlg.append(l) #salvo il file postaggato e senza \n inutili filename = str(ifile) + u'.txt' filename = self.folderDialoghiPosTag + filename #salvo il file dei dialoghi if SaveLoad.SaveByte(dlg, filename): ifile += 1 except: #potrebbero esserci errori di qualsiasi natura nei file, in questo caso ignoro il file e passo al successivo pass print "elaborazione file ", fil[len(self.folderDialoghiClean ):], " - ", i, " / ", tot i += 1
def AvviaCreazioneDati(self): sentWithSentimento = list() i = 0 files = glob.glob(self.__folderdialoghi + '*') for file in files: print file dlg = SaveLoad.LoadLines(file) for line in dlg: line = self.SentiAnaliz.StimaSentimentoFrase(line) if line != list(): for l in line: if l != list(): sentWithSentimento.append(l) # print 'file', i i += 1 if i > 50: return sentWithSentimento return sentWithSentimento
def SentimentiMostFreq(self): lines=SaveLoad.LoadLines(self.sentimentiMostFreqFilename) for line in lines: try: print line keyRicerca=line.strip() _=YahooAnswer.YahooAnswer(answer=keyRicerca, ordinamento='rillevanza', numeroRisultati=3) except: pass
def __ElaboraFile(self, file):
    """Index one dialog file: for every consecutive pair of lines, map
    the sentence of a line to the sentence of the line that follows it
    in self.sentsentSuccessiva."""
    dlg = SaveLoad.LoadLines(file)
    if not dlg:
        return
    # the last line has no successor, hence len - 1 pairs
    for idx in xrange(len(dlg) - 1):
        key = u"".join(self.getSent(dlg[idx])).strip()
        val = u"".join(self.getSent(dlg[idx + 1])).strip()
        self.sentsentSuccessiva[key].append(val)
def __init__(self, dlgname): self.extdlg = '.txt' self.extdlgData = ".dat" self.extdlgVoc = ".voc" self.extdictWordScoreRow = '.dictWordsScoreRows' self.folder = "risorse\\Dati\\dialoghi\\" dlg = SaveLoad.LoadLines(self.folder + "dialoghiRaw\\" + dlgname + self.extdlg) if dlg: self.__Tfidf(dlg) self.__Save(dlgname) else: print "file di dialogo mancante\nimpossibile procedere..."
def __CaricaDlg(self, dlgname): self.dlg = SaveLoad.LoadLines(self.folder + "\\dialoghiRaw\\" + dlgname + self.extdlg) self.dictWordScoreRow = SaveLoad.LoadByte(self.folder + dlgname + self.extdictWordScoreRow) self.dictWordIndex = SaveLoad.LoadByte(self.folder + dlgname + self.extdictWordIndex) if not (self.dictWordScoreRow and self.dictWordIndex): print "file dict mancanti\ncreazione in corso..." CreaDatiQuestEngine.CreaDatiQuestEngine(dlgname) #ricarico i dati self.dictWordScoreRoww = SaveLoad.LoadByte( self.folder + dlgname + self.extdictWordScoreRow) self.dictWordIndex = SaveLoad.LoadByte(self.folder + dlgname + self.extdictWordIndex)
def __Train(self):
    """Train the Punkt sentence tokenizer on the tagged corpus, seeding
    it with the abbreviation list when available.

    Returns True when the custom tokenizer was trained; on any failure
    (or when the abbreviation file is missing) falls back to the default
    tokenizer and returns None.
    """
    try:
        abl = SaveLoad.LoadLines(self.__abbrvFilename)
        train = self.__corpus.tagged_sents()[:]
        # LoadLines returns False when the abbreviation file is missing
        if abl != False:
            punkt_param = nltk.tokenize.punkt.PunktParameters()
            punkt_param.abbrev_types = abl
            self.__sentsTokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer(
                punkt_param)
            self.__sentsTokenizer.train(train)
            return True
        else:
            self.__DefaultSentTokenize()
    except Exception:
        # any failure (corpus missing, training error): load the default
        # tokenizer.  (Narrowed from a bare `except:`.)
        self.__DefaultSentTokenize()
def __CreaCorpus(self, dlgfilename):
    """Load the dialog file and vectorize it.

    Returns the (corpus, vocab) pair produced by __corpus2vectors.
    """
    sentences = SaveLoad.LoadLines(dlgfilename)
    return self.__corpus2vectors(sentences)
__folder = "risorse\\Dati\\" __stopWordsFilename = __folder + "StopWords.list" __stopWords = list() __pulisci = PulisciSent.PulisciSent() __sentTokenize = SentenceTokenize.SentsTokenize() __PosTagger = PosTagger_light.PosTagger() __Parse = ParseIt.ParseIt() __Stem = StemmerIt.StemmerIt() tagtoexclude = [u"PUNC", u"SYM", u"NUM", u"ART", u"PRE"] WordsInvertiSent = [u"no", u"non"] #carico le liste di dati pos = SaveLoad.LoadLines(__folder + "SentiPos.txt") neg = SaveLoad.LoadLines(__folder + "SentiNeg.txt") dictsentiScore = {word.strip(): float(1) for word in pos} dictsentiScore.update({word.strip(): float(-1) for word in neg}) sentiPos = [word.strip() for word in pos] sentiNeg = [word.strip() for word in neg] def Analisi(sents): """ Analisi frase input: string
def __init__(self, dlgname):
    """Build (or load) the TF-IDF-derived lookup tables for the dialog
    *dlgname* and persist them to disk.

    Produces:
      self.dictWordIndex    -- [word] -> column index in the tf-idf matrix
      self.dictWordScoreRow -- [word] -> list of {score: row} dicts
    """
    # file-extension constants for the persisted artifacts
    self.extdlg = ".txt"
    self.extdlgData = ".dat"
    self.extdlgVoc = ".voc"
    self.extdictWordIndex = '.dictWordIndex'
    self.extdictWordScoreRow = '.dictWordsScoreRows'
    self.folder = "risorse\\Dati\\dialoghi\\"
    dlg = SaveLoad.LoadLines(self.folder + "\\dialoghiRaw\\" + dlgname + self.extdlg)
    #dlgfilename="C:\\Esame TTR\\risorse\\Dati\\dialoghi\\dialoghiRaw\\Black Mirror dvdrip.txt"
    dlgdata = SaveLoad.LoadByte(self.folder + dlgname + self.extdlgData)
    vocab = SaveLoad.LoadByte(self.folder + dlgname + self.extdlgVoc)
    try:
        # probe: .mean() only exists when a numeric matrix was loaded
        dlgdata.mean()
    except:
        print "file .dat mancante\ncreazione in corso..."
        # create the file with tf-idf, cosine, etc...
        tfidfDlgCreator.tfidfDataCreator(dlgname)
        # reload the data
        dlgdata = SaveLoad.LoadByte(self.folder + dlgname + self.extdlgData)
        vocab = SaveLoad.LoadByte(self.folder + dlgname + self.extdlgVoc)
    # [column index] -> word
    dictIndexWord = dict()
    for i in xrange(len(vocab)):
        dictIndexWord[i] = vocab[i]
    #dictPosWordsReverse
    self.dictWordIndex = dict()
    #[word]=index in dictIndexWord
    for k, v in dictIndexWord.iteritems():
        self.dictWordIndex[v] = k
    self.dictWordScoreRow = collections.defaultdict(list)
    #[word][score in row][Row] -> for max
    # print dlgdata.shape[0]
    # print dlgdata.shape[1]
    # walk every non-zero cell of the tf-idf matrix
    for row in xrange(dlgdata.shape[0]):
        for col in xrange(dlgdata.shape[1]):
            indice = tuple([row, col])
            prob = dlgdata[indice]
            if prob != float(0):
                # print row, col, prob
                scoreRow = dict()
                word = dictIndexWord[col]
                # score = term count in the row weighted by tf-idf value
                count = dlg[row].split()
                count = count.count(word)
                count = count * prob
                #dlgdata[row][col]
                scoreRow[count] = row
                self.dictWordScoreRow[word].append(scoreRow)
    # free the (potentially large) matrix before pickling the results
    del dlgdata
    print "file Saved:", SaveLoad.SaveByte(
        self.dictWordScoreRow, self.folder + dlgname + self.extdictWordScoreRow)
    print "file Saved:", SaveLoad.SaveByte(
        self.dictWordIndex, self.folder + dlgname + self.extdictWordIndex)
# -*- coding: utf-8 -*- """ Created on Thu Sep 10 05:38:55 2015 @author: Patrizio """ import SaveLoad import re print 'sistema tutti i dialoghi k' #funzione per non correggere la classe e rifare tutti i calcoli dlg = SaveLoad.LoadLines('tutti i dialoghi k.txt') i = 0 for line in dlg: line = re.sub('(\s){2,}', u"#", line) line = re.sub('(\s){,}', u"", line) line = re.sub('(#)', u" ", line) print line i += 1 if line > 3: break
# -*- coding: utf-8 -*- """ Spyder Editor This is a temporary script file. """ import SaveLoad import AnalizzatoreSents import time from sklearn.feature_extraction.text import TfidfVectorizer filename = "C:\\Esame TTR\\risorse\\Dati\\dialoghi\\dialoghiRaw\\Burn_Notice.txt" dlg = SaveLoad.LoadLines(filename) #analiz=AnalizzatoreSents.AnalizzatoreSents() print time.asctime() print 'creo copia in analiz' #copia per analiz dlgcopy = dlg #analiz.Analisi(dlg) print time.asctime() tfidf = TfidfVectorizer() print time.asctime() print 'creo tfidf' tfidfdata = tfidf.fit_transform(dlgcopy) vocab = tfidf.get_feature_names() print type(vocab)