def __Load(self):
    """Load the pickled unigram and bigram taggers from disk.

    Returns True only when both taggers were loaded successfully.
    """
    self.__unigram_tagger = SaveLoad.LoadByte(self.__unigram_taggerFileName)
    self.__bigram_tagger = SaveLoad.LoadByte(self.__bigram_taggerFileName)
    # SaveLoad.LoadByte returns False on failure; keep the explicit
    # comparison rather than a plain truthiness test.
    return (self.__unigram_tagger != False) and (self.__bigram_tagger != False)
def __init__(self):
    """Resolve every resource path, load the word-frequency data and run
    the full keyword/sentiment extraction pipeline."""
    self.qta = 155  # number of Yahoo Answers entries to extract
    base = "risorse\\Dati\\"
    self.__folder = base
    self.__foldertaggedsents = base + "training\\paisaTagged"
    self.wordsFreqFilename = base + "wordFreq.fdist"
    self.stopWordsFilename = base + "StopWords.list"
    self.folderDialoghiPosTag = base + "dialoghi\\postag\\"
    self.domandeMostFreqFilename = base + "domandeMostFreq.txt"
    self.sentimentiMostFreqFilename = base + "sentimentiMostFreq.txt"
    self.emozioniMostFreqFilename = base + "emozioniMostFreq.txt"
    self.wordsFreq = SaveLoad.LoadByte(self.wordsFreqFilename)
    # self.stopWords=SaveLoad.LoadByte(self.stopWords)
    # (commented-out in the original; note it passes self.stopWords, not
    # self.stopWordsFilename -- left untouched)
    self.noun = set()
    self.verb = set()
    self.SentimentiMostFreq()
    self.EmozioniMostFreq()
    # questions taken from the internet as popular culture plus Google keywords
    self.DomaneMostFreq()
    # keywords taken from the dialogues, shuffled randomly
    self.ElaboraDati()
    self.EstraiMostFreq()
    self.EstraiYahooAnswers(self.qta)
def Load(self):
    """Load the pickled sentiment classifier from disk.

    Returns:
        True if the classifier was loaded, False otherwise.
    """
    self.classificatoreSenti = SaveLoad.LoadByte(self.classificatoreSentiFilename)
    # LoadByte yields a falsy value on a missing/corrupt file; collapse the
    # manual `if x: return True / return False` to the idiomatic bool().
    return bool(self.classificatoreSenti)
def __Load(self): #carico le liste di dati pos=SaveLoad.LoadLines(self.__folder+"SentiPos.txt") neg=SaveLoad.LoadLines(self.__folder+"SentiNeg.txt") self.dictsentiScore={} self.sentiPos=list() for word in pos: word=word.strip() word=self.__PosTagger.PosTag(word) if word!=list(): word=self.__Stem.StemWord(word[0]) self.dictsentiScore[word[0]]=float(1) self.sentiPos.append(word[0]) self.sentiNeg=list() for word in neg: word=word.strip() word=self.__PosTagger.PosTag(word) if word!=list(): word=self.__Stem.StemWord(word[0]) self.dictsentiScore[word[0]]=float(-1) self.sentiNeg.append(word[0]) self.__stopWords=SaveLoad.LoadByte(self.__stopWordsFilename) if not self.__stopWords: print "file stopwords mancante...\nestrazione in corso" paisaWordsDataExtractor.paisaWordsDataExtractor() if not self.__Load(): print "processo di estrazione corrotto\nimpossibile procedere..." return False return True
def __CaricaDlg(self, dlgname): self.dlg = SaveLoad.LoadLines(self.folder + "\\dialoghiRaw\\" + dlgname + self.extdlg) self.dictWordScoreRow = SaveLoad.LoadByte(self.folder + dlgname + self.extdictWordScoreRow) self.dictWordIndex = SaveLoad.LoadByte(self.folder + dlgname + self.extdictWordIndex) if not (self.dictWordScoreRow and self.dictWordIndex): print "file dict mancanti\ncreazione in corso..." CreaDatiQuestEngine.CreaDatiQuestEngine(dlgname) #ricarico i dati self.dictWordScoreRoww = SaveLoad.LoadByte( self.folder + dlgname + self.extdictWordScoreRow) self.dictWordIndex = SaveLoad.LoadByte(self.folder + dlgname + self.extdictWordIndex)
def __Load(self):
    """Load the pickled grammars, generating them on first run.

    input: None
    hidden: loads self.__grammars; when the pickle is missing it runs
        GrammarExtractor and loads the result.
    output: True when the process completes.
    """
    self.__grammars = SaveLoad.LoadByte(self.__grammarsFilename)
    if self.__grammars:
        return True
    # first run: build the grammar file, then load it
    import GrammarExtractor
    GrammarExtractor.GrammarExtractor()
    self.__grammars = SaveLoad.LoadByte(self.__grammarsFilename)
    return True
def __Load():
    # Load the pickled stop-word set, regenerating it when missing, and
    # retry once via recursion. Returns True on success, False otherwise.
    #
    # NOTE(review): unlike the sibling `__Load(self)` implementations,
    # this one takes no *self* and reads/writes the bare names
    # __stopWords / __stopWordsFilename. That only works if those exist
    # in the enclosing (module) scope; if this was meant to be a method,
    # it is missing `self` -- confirm against the caller before fixing.
    __stopWords = SaveLoad.LoadByte(__stopWordsFilename)
    if not __stopWords:
        print "file stopwords mancante...\nestrazione in corso"
        # regenerate the stop-word data, then retry the load once
        paisaWordsDataExtractor.paisaWordsDataExtractor()
        if not __Load():
            print "processo di estrazione corrotto\nimpossibile procedere..."
            return False
    return True
def ElaboraDati(self): files=glob.glob(self.folderDialoghiPosTag+'*') tot=len(files) i=1 for file in files: dati=SaveLoad.LoadByte(file) for line in dati: for word in line: if word[1]==u"NOUN": self.noun.add(word[0]) elif word[1]==u"VER": self.verb.add(word[0]) print "Elaborato", i, "/",tot i+=1
def __Load(self):
    """Load the pickled sentence tokenizer.

    input: None
    hidden: loads the tokenizer; when the pickle is missing, opens the
        tagged-sentence corpus reader needed to train a new one.
    output: True when the tokenizer was loaded, False otherwise.
    """
    self.__sentsTokenizer = SaveLoad.LoadByte(self.__sentsTokenFileName)
    if self.__sentsTokenizer:
        return True
    # no saved tokenizer: prepare the training corpus and report failure
    self.__corpus = nltk.corpus.ConllCorpusReader(
        self.__foldertaggedsents, '.*', self.__lst_pos)
    return False
def __init__(self):
    """Resolve the stop-word resource path and load the pickled set."""
    folder = "risorse\\Dati\\"
    self.__folder = folder
    self.stopWordFilename = folder + "StopWords.list"
    self.stopWords = SaveLoad.LoadByte(self.stopWordFilename)
def __CreaClassificatore(self):
    """Train the sentiment SVM classifier from the saved training set and
    persist the result via Save()."""
    trainset = SaveLoad.LoadByte(self.__SentimTrainsetFilename)
    self.classificatoreSenti = self.SKClassifierSVM(trainset)
    self.Save()
def Load(self):
    """Load the pickled suffix->tag table from disk.

    Returns:
        True if the table was loaded, False otherwise.
    """
    self.suffissiTag = SaveLoad.LoadByte(self.suffissiTagFilename)
    # LoadByte yields a falsy value on failure; collapse the manual
    # `if x: return True / return False` to the idiomatic bool().
    return bool(self.suffissiTag)
def __init__(self, dlgname): self.extdlg = ".txt" self.extdlgData = ".dat" self.extdlgVoc = ".voc" self.extdictWordIndex = '.dictWordIndex' self.extdictWordScoreRow = '.dictWordsScoreRows' self.folder = "risorse\\Dati\\dialoghi\\" dlg = SaveLoad.LoadLines(self.folder + "\\dialoghiRaw\\" + dlgname + self.extdlg) #dlgfilename="C:\\Esame TTR\\risorse\\Dati\\dialoghi\\dialoghiRaw\\Black Mirror dvdrip.txt" dlgdata = SaveLoad.LoadByte(self.folder + dlgname + self.extdlgData) vocab = SaveLoad.LoadByte(self.folder + dlgname + self.extdlgVoc) try: dlgdata.mean() except: print "file .dat mancante\ncreazione in corso..." #creo il file con tdidft cosine ecc... tfidfDlgCreator.tfidfDataCreator(dlgname) #ricarico i dati dlgdata = SaveLoad.LoadByte(self.folder + dlgname + self.extdlgData) vocab = SaveLoad.LoadByte(self.folder + dlgname + self.extdlgVoc) dictIndexWord = dict() for i in xrange(len(vocab)): dictIndexWord[i] = vocab[i] #dictPosWordsReverse self.dictWordIndex = dict() #[word]=index in dictIndexWord for k, v in dictIndexWord.iteritems(): self.dictWordIndex[v] = k self.dictWordScoreRow = collections.defaultdict(list) #[word][score in row][Row] -> per max # print dlgdata.shape[0] # print dlgdata.shape[1] for row in xrange(dlgdata.shape[0]): for col in xrange(dlgdata.shape[1]): indice = tuple([row, col]) prob = dlgdata[indice] if prob != float(0): # print row, col, prob scoreRow = dict() word = dictIndexWord[col] count = dlg[row].split() count = count.count(word) count = count * prob #dlgdata[row][col] scoreRow[count] = row self.dictWordScoreRow[word].append(scoreRow) del dlgdata print "file Saved:", SaveLoad.SaveByte( self.dictWordScoreRow, self.folder + dlgname + self.extdictWordScoreRow) print "file Saved:", SaveLoad.SaveByte( self.dictWordIndex, self.folder + dlgname + self.extdictWordIndex)
# NOTE(review): the next loop is an orphaned fragment -- the enclosing def
# (which defines `dati` and `features`) is not visible in this chunk, so
# its exact indentation level is an assumption to be confirmed.
    for dato in dati:
        # only words longer than 3 chars get features: a 2-char prefix
        # plus the last one, two and three characters (suffixes)
        if len(dato[0]) > 3:
            i = dato[0][:2]
            fs = dato[0][-1:]
            fd = dato[0][-2:]
            ft = dato[0][-3:]
            feature = {'i': i, 'fs': fs, 'fd': fd, 'ft': ft}
            features.append([feature, dato[1]])
    return features


def Feature(dato):
    # Build the affix feature dict for one word: 2-char prefix ('i') plus
    # the last one ('fs'), two ('fd') and three ('ft') characters.
    i = dato[:2]
    fs = dato[-1:]
    fd = dato[-2:]
    ft = dato[-3:]
    return {'i': i, 'fs': fs, 'fd': fd, 'ft': ft}


# Ad-hoc smoke test: classify a single word with the saved SVM tagger.
import SaveLoad

svm = SaveLoad.LoadByte("SVM_mf.tmp")
dt = SaveLoad.LoadByte("NLTK_DTP_mf.tmp")  # NOTE(review): loaded but unused here
word = 'caschereste'
w = Feature(word)
print svm.classify(w)
def __Load(self): self.parole = SaveLoad.LoadByte(self.paroleFilename) print len(self.parole) self.parole = sorted({i for i in self.parole}) print len(self.parole)