def ChargementMotsGrossier():
    listeMotsGrossier = txtFileToListe("Data/MotsGrossiers/MGFromWikipedia.txt")
    listeMotsGrossier += txtFileToListe("Data/MotsGrossiers/MGSupplementaire.txt")
    global MotsGrossiers
    for motGrossier in listeMotsGrossier:
        motGrossierNormaliser = normalisationSimple(motGrossier)
        if len(motGrossierNormaliser.split()) == 1:
            MotsGrossiers[motGrossierNormaliser] = []
    listeMGnormaliser = []
    for motGrossier in MotsGrossiers.keys():
        listeMGnormaliser.append(motGrossier + "\n")
    WriteInFile("Normalisation/MG.txt", listeMGnormaliser)
def DataPredictionToSMV(filename, index):
    svmFilename = "GenerationFichier/SVM/DT.svm"
    if os.path.isfile(svmFilename):
        os.remove(svmFilename)
    lignes = txtFileToListe("Normalisation/texteDT.txt")
    for line in lignes:
        lineToSVMfile(svmFilename, line, '0', index)
    return index
def TXTtoSVM(filename, svmFilename, index, labelFilename=None):
    if os.path.isfile(svmFilename):
        os.remove(svmFilename)
    print("Starting conversion of file " + filename)
    lignes = txtFileToListe(filename)
    labels = []
    if labelFilename is None:
        # No label file given: default every line to label 0.
        nbrLignes = len(lignes)
        labels = [0] * nbrLignes  # https://www.geeksforgeeks.org/python-which-is-faster-to-initialize-lists/
    else:
        labels = txtFileToListe(labelFilename)
    for i in range(len(labels)):
        lineToSVMfile(svmFilename, lignes[i], labels[i], index)
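# lineToSVMfile is defined elsewhere in the project; the sketch below only
# illustrates the libsvm line format it is assumed to produce
# ("label featureId:value ..."). The _sketch name and the hypothetical
# word-to-feature-id dictionary (wordIds) are assumptions, not the real
# IndexBuilder-based implementation.
def lineToSVMfile_sketch(svmFilename, line, label, wordIds):
    # Count word occurrences in the line, then append one libsvm-formatted row.
    counts = {}
    for mot in line.split():
        if mot in wordIds:
            counts[wordIds[mot]] = counts.get(wordIds[mot], 0) + 1
    features = " ".join(str(fid) + ":" + str(val) for fid, val in sorted(counts.items()))
    with open(svmFilename, 'a') as f:
        f.write(str(label) + " " + features + "\n")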
def ChargementMotsVide():
    # List all files in a directory:
    # https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
    chemin = "Data/StopWord/"
    listeFichiers = [f for f in listdir(chemin) if isfile(join(chemin, f))]
    global ListeMotVide
    for fichier in listeFichiers:
        path = join(chemin, fichier)
        listeMotVide = txtFileToListe(path)
        for mot in listeMotVide:
            MVnormaliser = normalisationSimple(mot)
            # Keep only single-word stop words once normalised.
            if len(MVnormaliser.split()) == 1:
                ListeMotVide[MVnormaliser] = []
def GetPolarityByWord():
    global WordPolarity
    dataWordPolarity = {}
    listePolarite = txtFileToListe("Normalisation/Polarite.txt")
    for polarite in listePolarite:
        # Each line: a word followed by three integer polarity counts, separated by ';'.
        polarite = polarite.replace("\n", '')
        data = polarite.split(";")
        mot = data[0]
        polariteMot = [int(data[1]), int(data[2]), int(data[3])]
        if mot in dataWordPolarity:
            polariteDejaExistante = dataWordPolarity[mot]
            dataWordPolarity[mot] = [
                polariteDejaExistante[0] + polariteMot[0],
                polariteDejaExistante[1] + polariteMot[1],
                polariteDejaExistante[2] + polariteMot[2]
            ]
        else:
            dataWordPolarity[mot] = polariteMot
    listeMotGrossier = txtFileToListe("Normalisation/MG.txt")
    for grossiertee in listeMotGrossier:
        # Swear words always count toward the last (negative) slot.
        WordPolarity[grossiertee.replace("\n", '')] = [0, 0, 1]
    for mot in dataWordPolarity.keys():
        WordPolarity[mot] = PolarityFromValues(dataWordPolarity[mot], mot)
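# PolarityFromValues is defined elsewhere in the project; the sketch below only
# illustrates one plausible reading of its role (turning a word's accumulated
# [count0, count1, count2] values into the triple stored in WordPolarity, here as a
# one-hot vector for the dominant count). The _sketch name and this behaviour are
# assumptions, not the project's actual implementation.
def PolarityFromValues_sketch(valeurs, mot):
    dominant = valeurs.index(max(valeurs))
    polarite = [0, 0, 0]
    polarite[dominant] = 1
    return polarite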
def Normalise_DonneeTest(TypeNormalisation="Simple"):
    fileName = "Data/donneeTest.txt"
    print("Starting normalisation of " + fileName)
    lignesSource = txtFileToListe(fileName)
    lignes = []
    ids = []
    for ligneSource in lignesSource:
        ligne = ""
        # The first 18 characters of each line hold the id; the text starts at position 19.
        texte = ligneSource[19:].replace('\n', '')
        id = ligneSource[:18] + "\n"
        message = normalisation(texte, TypeNormalisation)
        mots = message.split()
        for mot in mots:
            ligne += mot + " "
        lignes.append(ligne + "\n")
        ids.append(id)
    WriteInFile("Normalisation/idsDT.txt", ids)
    WriteInFile("Normalisation/texteDT.txt", lignes)
def GetModelFromDocToVec(filenames, pcTrain, vector_size, epochs, testing=False):
    docs = []
    for filename in filenames:
        print("Reading data from file " + filename)
        docs += txtFileToListe(filename, withSpaceTreatment=True)
    # Split the corpus into a training part and a testing part.
    nbrTrain = int((float(len(docs)) / 100) * pcTrain)
    trainData = CorpusToDocAndToken(docs[:nbrTrain])
    testData = CorpusToDocAndToken(docs[nbrTrain:], tokenOnly=True)
    print("Training corpus size: " + str(len(trainData)))
    print("Test corpus size: " + str(len(testData)))
    print("Training the model")
    model = Doc2Vec(vector_size=vector_size, min_count=1, epochs=epochs)
    model.build_vocab(trainData)
    model.train(trainData, total_examples=model.corpus_count, epochs=model.epochs)
    if testing:
        Testing(testData, trainData, model)
    return model
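# CorpusToDocAndToken is defined elsewhere in the project; the sketch below only
# illustrates the assumed behaviour (tokenise each document and, unless tokenOnly is
# set, wrap it in a gensim TaggedDocument so Doc2Vec can train on it). The _sketch
# name and the use of simple_preprocess are assumptions, not the project's code.
def CorpusToDocAndToken_sketch(docs, tokenOnly=False):
    from gensim.models.doc2vec import TaggedDocument
    from gensim.utils import simple_preprocess
    if tokenOnly:
        return [simple_preprocess(doc) for doc in docs]
    return [TaggedDocument(simple_preprocess(doc), [i]) for i, doc in enumerate(docs)]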
def UnlabeledTweetToPolarity():
    global WordPolarity
    listeTweet = txtFileToListe("Normalisation/70kTweet.txt")
    nbrBad = 0
    nbrCool = 0
    nbrNormal = 0
    nbrMixte = 0
    listTweetLabelliser = []
    listTweetLabel = []
    for tweet in listeTweet:
        tweet = tweet.replace("\n", '')
        # Sum the per-word polarity counts over the whole tweet.
        polarite = [0, 0, 0]
        for mot in tweet.split():
            if mot in WordPolarity:
                polarite = [
                    polarite[0] + WordPolarity[mot][0],
                    polarite[1] + WordPolarity[mot][1],
                    polarite[2] + WordPolarity[mot][2]
                ]
        if polarite[2] > 2:
            #print("Nasty : " + str(polarite) + " || " + tweet)
            listTweetLabelliser.append(tweet + "\n")
            listTweetLabel.append("3" + "\n")
            nbrBad += 1
        elif polarite[0] > 5 and polarite[2] == 0:
            #print("Nice : " + str(polarite) + " || " + tweet)
            listTweetLabelliser.append(tweet + "\n")
            listTweetLabel.append("2" + "\n")
            nbrCool += 1
        elif polarite[0] == 0 and polarite[2] == 0 and polarite[1] > 3:
            print("Neutral : " + str(polarite) + " || " + tweet)
            listTweetLabelliser.append(tweet + "\n")
            listTweetLabel.append("0" + "\n")
            nbrNormal += 1
    #WriteInFile("Normalisation/SVMunlabeled.txt", listTweetLabelliser)
    #WriteInFile("Normalisation/SVMunlabeledlabel.txt", listTweetLabel)
    print("Bad = " + str(nbrBad) + "| Cool = " + str(nbrCool) + "| Neutral = " + str(nbrNormal) + "| Mixed = " + str(nbrMixte))
def EnleverMotVideSpecifique(fichier):
    pathFile = "Normalisation/" + fichier + ".txt"
    index = IndexBuilder()
    lignes = txtFileToListe(pathFile, withSpaceTreatment=True)
    # Count how often each word appears in the corpus.
    for ligne in lignes:
        for mot in ligne.split():
            index.AddElem(mot)
    nbrOccurences = index.GetNombreOccurence()
    # Words whose total count exceeds 10% of the number of lines are treated as corpus-specific stop words.
    pc10 = int(float(len(lignes)) / 100 * 10)
    motVideSpecifique = {}
    for nbrOccurence in nbrOccurences:
        occurence = nbrOccurence[0]
        nbr = nbrOccurence[1][0]
        if nbr > pc10:
            motVideSpecifique[occurence] = 0
    res = ""
    for ligne in lignes:
        for mot in ligne.split():
            if mot not in motVideSpecifique:
                res += mot + " "
        res += "\n"
    print(motVideSpecifique)
    WriteInFile(pathFile, res)
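# IndexBuilder is defined elsewhere in the project; the sketch below only illustrates
# the interface assumed above: AddElem() counts occurrences and GetNombreOccurence()
# returns (word, [count, ...]) pairs. The _sketch name and the internal structure are
# assumptions for illustration only.
class IndexBuilder_sketch:
    def __init__(self):
        self.occurrences = {}

    def AddElem(self, mot):
        # Increment the occurrence count of the word.
        if mot not in self.occurrences:
            self.occurrences[mot] = [0]
        self.occurrences[mot][0] += 1

    def GetNombreOccurence(self):
        # Return a list of (word, [count]) pairs, as consumed by EnleverMotVideSpecifique.
        return list(self.occurrences.items())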
def RNNscript(fichierEntrainement, predictFile, pourcentageTrain, shapeRNN=[200, 150, 100], nameModel="RNNmodel",
              typeRNN="FFW", TailleDuComitee=0, activationType='relu', batch_size=32, epochs=10,
              SelectBestModel=False, FMaxValue=0):
    global listesFichiersEntrainement, tailleVecteurDocToVec, pcTrain, epochsDocToVec, testingDocToVec, selectBestModel, LastResult
    selectBestModel = SelectBestModel
    if typeRNN == "Committee" and TailleDuComitee == 0:
        print("Number of experts not specified, please set the TailleDuComitee parameter")
        quit()
    # Train (and save) the Doc2Vec model used to turn documents into vectors.
    docToVecModel = GetModelFromDocToVec(listesFichiersEntrainement, pcTrain, tailleVecteurDocToVec, epochsDocToVec, testing=testingDocToVec)
    docToVecModel.save("GenerationFichier/Models/DocToVec/" + nameModel + "_DocToVec")
    print("Starting creation of the RNN")
    fichierEntrainement = "Normalisation/" + fichierEntrainement + ".txt"
    fichierPredict = "Normalisation/" + predictFile + ".txt"
    fichierLabel = fichierEntrainement.replace(".txt", "label.txt")
    datas = txtFileToListe(fichierEntrainement, withSpaceTreatment=True)
    predictData = txtFileToListe(fichierPredict, withSpaceTreatment=True)
    labelDatas = txtFileToListe(fichierLabel, withSpaceTreatment=True)
    # Split the labelled data into training and test sets.
    nbrTrain = int((float(len(datas)) / 100) * pourcentageTrain)
    trainData = datas[:nbrTrain]
    trainLabel = labelDatas[:nbrTrain]
    testData = datas[nbrTrain:]
    testLabel = labelDatas[nbrTrain:]
    # Turn every document into its Doc2Vec vector.
    vectorTrain = [vectorFromDataList(trainData, docToVecModel)]
    vectorToPredict = [vectorFromDataList(predictData, docToVecModel)]
    vectorTest = [vectorFromDataList(testData, docToVecModel)]
    trainLabel = np_utils.to_categorical(trainLabel, 4)
    testLabel = np_utils.to_categorical(testLabel, 4)
    print("Number of training samples: " + str(len(vectorTrain[0])))
    print("Number of test samples: " + str(len(vectorTest[0])))
    metrics = ['accuracy']
    global BestModelFile
    if typeRNN == "FFW":
        print("Creating the FFW model")
        model = getFFWModel(shapeRNN, tailleVecteurDocToVec, activationType, metrics)
        BestModelFile = "GenerationFichier/Models/RNN/" + nameModel + ".hdf5"
        print("Training the model")
        # Start training
        callbackMetric = Metrics()
        model.fit(vectorTrain, trainLabel, batch_size=batch_size, epochs=epochs, verbose=1,
                  callbacks=[callbackMetric], validation_data=(vectorTest, testLabel))
        if selectBestModel:
            model = load_model(BestModelFile)
        score = model.evaluate(vectorTest, testLabel, verbose=0)
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])
        # Predict a label for every document of the prediction file.
        results = model.predict(vectorToPredict)
        labelResult = []
        for result in results:
            listResult = result.tolist()
            indexMax = listResult.index(max(listResult))
            labelResult.append(str(indexMax) + "\n")
        WriteInFile("GenerationFichier/RNNtestResult.txt", labelResult)
    elif typeRNN == "Committee":
        models = []
        print("Creating the committee (FFW models)")
        for i in range(0, TailleDuComitee):
            model = getFFWModel(shapeRNN, tailleVecteurDocToVec, activationType, metrics)
            if selectBestModel:
                BestModelFile = "GenerationFichier/Models/RNN/" + nameModel + str(i) + ".hdf5"
            callbackMetric = Metrics()
            model.fit(vectorTrain, trainLabel, batch_size=batch_size, epochs=epochs, verbose=0,
                      callbacks=[callbackMetric], validation_data=(vectorTest, testLabel))
            if selectBestModel:
                models.append(load_model(BestModelFile))
            else:
                models.append(model)
            print(str((i + 1) * (100 / TailleDuComitee)) + "%")
        # Each expert votes with its predictions, weighted by its test accuracy.
        expertsResult = []
        TailleDuComitee = len(models)
        for numExpert in range(0, TailleDuComitee):
            score = models[numExpert].evaluate(vectorTest, testLabel, verbose=0)
            poids = score[1]
            resultExpert = models[numExpert].predict(vectorToPredict).tolist()
            for y in range(0, len(resultExpert)):
                if len(expertsResult) == y:
                    expertsResult.append([valeur * poids for valeur in resultExpert[y]])
                else:
                    for z in range(0, len(expertsResult[y])):
                        expertsResult[y][z] += (resultExpert[y][z] * poids)
        labelResult = []
        for result in expertsResult:
            indexMax = result.index(max(result))
            labelResult.append(str(indexMax) + "\n")
        WriteInFile("GenerationFichier/RNNtestResult.txt", labelResult)
    else:
        raise ValueError("Unrecognised model: check the requested neural network type")
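# getFFWModel is defined elsewhere in the project; the sketch below is only a minimal
# illustration of what it is assumed to build: a Keras feed-forward classifier whose
# hidden layer sizes come from shapeRNN, whose input size is the Doc2Vec vector size,
# and which ends with a 4-class softmax (matching to_categorical(..., 4) above).
# The 'adam' optimizer and the _sketch name are assumptions, not the project's code.
def getFFWModel_sketch(shapeRNN, inputSize, activationType, metrics):
    from keras.models import Sequential
    from keras.layers import Dense
    model = Sequential()
    model.add(Dense(shapeRNN[0], activation=activationType, input_dim=inputSize))
    for taille in shapeRNN[1:]:
        model.add(Dense(taille, activation=activationType))
    model.add(Dense(4, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=metrics)
    return model

# Hypothetical call (the file names are only an example and assume a matching
# "<name>label.txt" file exists next to the training file):
# RNNscript("SVMunlabeled", "texteDT", 80, typeRNN="Committee", TailleDuComitee=5)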