def ChargementMotsGrossier():
    """Load the swear-word lists, fill the global ``MotsGrossiers`` dict
    with their normalised single-word entries, and persist the resulting
    word list to Normalisation/MG.txt (one word per line).
    """
    listeMotsGrossier = txtFileToListe(
        "Data/MotsGrossiers/MGFromWikipedia.txt")
    listeMotsGrossier += txtFileToListe(
        "Data/MotsGrossiers/MGSupplementaire.txt")

    global MotsGrossiers
    for motGrossier in listeMotsGrossier:
        motGrossierNormaliser = normalisationSimple(motGrossier)
        # Keep single-word entries only; multi-word expressions are dropped.
        if len(motGrossierNormaliser.split()) == 1:
            MotsGrossiers[motGrossierNormaliser] = []

    # Newline-terminate each word for WriteInFile (comprehension replaces
    # the original manual append loop; iterating the dict yields its keys).
    listeMGnormaliser = [mot + "\n" for mot in MotsGrossiers]
    WriteInFile("Normalisation/MG.txt", listeMGnormaliser)
# Beispiel #2
# 0
def DataPredictionToSMV(filename, index):
    """Convert the normalised test corpus to SVM format.

    Rebuilds GenerationFichier/SVM/DT.svm from Normalisation/texteDT.txt,
    labelling every line '0'.  Returns ``index`` unchanged.

    NOTE(review): the ``filename`` argument is never used -- the input
    path is hard-coded; confirm with callers before relying on it.
    """
    svmFilename = "GenerationFichier/SVM/DT.svm"
    # Start from a clean output file.
    if os.path.isfile(svmFilename):
        os.remove(svmFilename)
    for ligne in txtFileToListe("Normalisation/texteDT.txt"):
        lineToSVMfile(svmFilename, ligne, '0', index)
    return index
# Beispiel #3
# 0
def TXTtoSVM(filename, svmFilename, index, labelFilename=None):
    """Convert a normalised text file to SVM format.

    filename      -- input text file, one document per line.
    svmFilename   -- output .svm file (recreated from scratch).
    index         -- word index forwarded to lineToSVMfile.
    labelFilename -- optional label file (one label per line); when None,
                     every document is labelled 0.
    """
    # Start from a clean output file.
    if os.path.isfile(svmFilename):
        os.remove(svmFilename)

    print("Lancement de la transformation du fichier " + filename)
    lignes = txtFileToListe(filename)

    # `is None`, not `== None` (PEP 8); default every label to 0.
    if labelFilename is None:
        labels = [0] * len(lignes)
    else:
        labels = txtFileToListe(labelFilename)

    # zip pairs each line with its label; unlike the original
    # `range(len(labels))` indexing, it cannot raise IndexError when the
    # label file has more lines than the text file.
    for ligne, label in zip(lignes, labels):
        lineToSVMfile(svmFilename, ligne, label, index)
def ChargementMotsVide():
    """Populate the global ``ListeMotVide`` dict with the normalised
    single-word stop words read from every file under Data/StopWord/.
    """
    # Listing files in a directory:
    # https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
    chemin = "Data/StopWord/"
    global ListeMotVide
    for nomFichier in listdir(chemin):
        cheminFichier = join(chemin, nomFichier)
        # Skip anything that is not a regular file (sub-directories, ...).
        if not isfile(cheminFichier):
            continue
        for mot in txtFileToListe(cheminFichier):
            motNormalise = normalisationSimple(mot)
            # Only keep single-word stop words.
            if len(motNormalise.split()) == 1:
                ListeMotVide[motNormalise] = []
# Beispiel #5
# 0
def GetPolarityByWord():
    """Aggregate per-word polarity counts from Normalisation/Polarite.txt
    into the global ``WordPolarity`` dict; words from Normalisation/MG.txt
    (swear words) are forced to the polarity vector [0, 0, 1].
    """
    global WordPolarity
    dataWordPolarity = {}
    listePolarite = txtFileToListe("Normalisation/Polarite.txt")
    for polarite in listePolarite:
        # Each line has the shape: word;count1;count2;count3
        polarite = polarite.replace("\n", '')
        data = polarite.split(";")
        mot = data[0]
        polariteMot = [int(data[1]), int(data[2]), int(data[3])]
        # BUG FIX: dict.has_key() was removed in Python 3 -- use `in`.
        if mot in dataWordPolarity:
            # Sum the new counts into the existing polarity vector.
            existante = dataWordPolarity[mot]
            dataWordPolarity[mot] = [a + b for a, b in zip(existante, polariteMot)]
        else:
            dataWordPolarity[mot] = polariteMot

    # NOTE(review): lines read from MG.txt may keep their trailing "\n"
    # (the file is written newline-terminated), in which case these keys
    # would never match real tokens -- verify txtFileToListe strips them.
    listeMotGrossier = txtFileToListe("Normalisation/MG.txt")
    for grossiertee in listeMotGrossier:
        WordPolarity[grossiertee] = [0, 0, 1]

    for mot in dataWordPolarity:
        WordPolarity[mot] = PolarityFromValues(dataWordPolarity[mot], mot)
def Normalise_DonneeTest(TypeNormalisation="Simple"):
    """Normalise Data/donneeTest.txt: write the tweet ids to
    Normalisation/idsDT.txt and the normalised texts to
    Normalisation/texteDT.txt (one entry per source line).
    """
    fileName = "Data/donneeTest.txt"
    print("Lancement de la normalisation de " + fileName)
    lignes = []
    ids = []
    for brut in txtFileToListe(fileName):
        # Fixed-width layout: chars [0:18] hold the id, [19:] the text
        # (char 18 is presumably a separator -- confirm with the data file).
        texte = brut[19:].replace('\n', '')
        ids.append(brut[:18] + "\n")
        message = normalisation(texte, TypeNormalisation)
        # Rebuild the line with a trailing space after every word,
        # exactly like the original accumulation loop.
        ligne = "".join(mot + " " for mot in message.split())
        lignes.append(ligne + "\n")
    WriteInFile("Normalisation/idsDT.txt", ids)
    WriteInFile("Normalisation/texteDT.txt", lignes)
def GetModelFromDocToVec(filenames, pcTrain, vector_size, epochs, testing=False):
    """Train a Doc2Vec model on ``pcTrain`` percent of the lines read from
    ``filenames``; optionally evaluate on the remainder via Testing().
    Returns the trained gensim Doc2Vec model.
    """
    corpus = []
    for filename in filenames:
        print("Lectures des données sur le fichier " + filename)
        corpus += txtFileToListe(filename, withSpaceTreatment=True)

    # Split the corpus: the first pcTrain% trains, the rest tests.
    tailleTrain = int((float(len(corpus)) / 100) * pcTrain)
    dataTrain = CorpusToDocAndToken(corpus[:tailleTrain])
    dataTest = CorpusToDocAndToken(corpus[tailleTrain:], tokenOnly=True)

    print("Taille du corpus d'apprentissage : " + str(len(dataTrain)))
    print("Taille du corpus de teste : " + str(len(dataTest)))

    print("Apprentissage du model")
    model = Doc2Vec(vector_size=vector_size, min_count=1, epochs=epochs)
    model.build_vocab(dataTrain)
    model.train(dataTrain, total_examples=model.corpus_count, epochs=model.epochs)
    if testing:
        Testing(dataTest, dataTrain, model)
    return model
# Beispiel #8
# 0
def UnlabeledTweetToPolarity():
    """Weakly label the 70k unlabelled tweets by summing the polarity
    vectors of their words (global ``WordPolarity``) and print how many
    tweets fall in each class.

    Polarity vectors are 3-element count lists; the thresholds below map
    them to labels: "3" (hateful), "2" (friendly), "0" (neutral).
    """
    global WordPolarity
    listeTweet = txtFileToListe("Normalisation/70kTweet.txt")
    nbrBad = 0
    nbrCool = 0
    nbrNormal = 0
    nbrMixte = 0  # NOTE(review): never incremented -- always printed as 0
    listTweetLabelliser = []
    listTweetLabel = []
    for tweet in listeTweet:
        # BUG FIX: str.replace returns a new string; the original code
        # discarded the result, so the trailing "\n" was never removed.
        tweet = tweet.replace("\n", '')
        polarite = [0, 0, 0]
        for mot in tweet.split():
            # BUG FIX: dict.has_key() was removed in Python 3 -- use `in`.
            if mot in WordPolarity:
                polarite = [
                    polarite[0] + WordPolarity[mot][0],
                    polarite[1] + WordPolarity[mot][1],
                    polarite[2] + WordPolarity[mot][2]
                ]

        if polarite[2] > 2:
            # Negative component above 2: hateful tweet (label 3).
            listTweetLabelliser.append(tweet + "\n")
            listTweetLabel.append("3" + "\n")
            nbrBad += 1
        elif polarite[0] > 5 and polarite[2] == 0:
            # Clearly positive with no negative words: friendly (label 2).
            listTweetLabelliser.append(tweet + "\n")
            listTweetLabel.append("2" + "\n")
            nbrCool += 1
        elif polarite[0] == 0 and polarite[2] == 0 and polarite[1] > 3:
            # Only neutral words: neutral tweet (label 0).
            print("Neutre : " + str(polarite) + " || " + tweet)
            listTweetLabelliser.append(tweet + "\n")
            listTweetLabel.append("0" + "\n")
            nbrNormal += 1

    #WriteInFile("Normalisation/SVMunlabeled.txt", listTweetLabelliser)
    #WriteInFile("Normalisation/SVMunlabeledlabel.txt", listTweetLabel)
    print("Bad = " + str(nbrBad) + "| Cool = " + str(nbrCool) + "| neutre = " +
          str(nbrNormal) + "| mixte = " + str(nbrMixte))
def EnleverMotVideSpecifique(fichier):
    """Remove corpus-specific stop words from Normalisation/<fichier>.txt.

    Any word occurring in more than 10% of the lines is treated as a
    corpus-specific stop word; the filtered text is written back to the
    same file.
    """
    pathFile = "Normalisation/" + fichier + ".txt"
    index = IndexBuilder()
    lignes = txtFileToListe(pathFile, withSpaceTreatment=True)
    for ligne in lignes:
        for mot in ligne.split():
            index.AddElem(mot)

    # Threshold: 10% of the number of lines.
    pc10 = int(float(len(lignes)) / 100 * 10)
    motVideSpecifique = {}
    for nbrOccurence in index.GetNombreOccurence():
        mot = nbrOccurence[0]
        nbr = nbrOccurence[1][0]
        if nbr > pc10:
            motVideSpecifique[mot] = 0

    # Build the filtered text with a single join instead of the original
    # quadratic string `+=` accumulation.
    morceaux = []
    for ligne in lignes:
        for mot in ligne.split():
            # BUG FIX: dict.has_key() was removed in Python 3 -- use `in`.
            if mot not in motVideSpecifique:
                morceaux.append(mot + " ")
        morceaux.append("\n")
    res = "".join(morceaux)
    print(motVideSpecifique)
    # NOTE(review): other call sites pass WriteInFile a list of lines;
    # here it receives one string -- confirm WriteInFile accepts both.
    WriteInFile(pathFile, res)
# Beispiel #10
# 0
def RNNscript(fichierEntrainement, predictFile, pourcentageTrain, shapeRNN=[200,150,100], nameModel="RNNmodel", typeRNN="FFW", TailleDuComitee=0, activationType='relu', batch_size=32, epochs=10, SelectBestModel=False, FMaxValue=0):
    """Train a Doc2Vec embedding plus a feed-forward network (or a weighted
    committee of them) and write predicted labels to
    GenerationFichier/RNNtestResult.txt.

    fichierEntrainement -- base name (under Normalisation/) of the training text;
                           labels are read from "<name>label.txt" alongside it
    predictFile         -- base name (under Normalisation/) of the data to label
    pourcentageTrain    -- percentage of the data used for training
    shapeRNN            -- hidden-layer sizes (NOTE(review): mutable default
                           argument -- safe only if never mutated)
    typeRNN             -- "FFW" for a single network, "Committee" for an
                           ensemble of TailleDuComitee networks weighted by
                           their test accuracy; anything else raises ValueError
    SelectBestModel     -- reload the best checkpoint written by Metrics()
    FMaxValue           -- unused in this function; presumably consumed
                           elsewhere via globals -- TODO confirm
    """
    global listesFichiersEntrainement, tailleVecteurDocToVec, pcTrain, epochsDocToVec, testingDocToVec, selectBestModel, LastResult
    selectBestModel = SelectBestModel
    if typeRNN == "Committee" and TailleDuComitee == 0:
        print("Nombre d'expert non indiqué, veuillez renseigner le parametre TailleDuComitee ")
        quit()
    # Train the document-embedding model first; its vectors feed the RNN.
    docToVecModel = GetModelFromDocToVec(listesFichiersEntrainement, pcTrain, tailleVecteurDocToVec, epochsDocToVec, testing=testingDocToVec)
    docToVecModel.save("GenerationFichier/Models/DocToVec/" + nameModel + "_DocToVec")
    print("Lancement de la création du RNN")
    # Resolve the three input files from the base names.
    fichierEntrainement = "Normalisation/" + fichierEntrainement + ".txt"
    fichierPredict = "Normalisation/" + predictFile + ".txt"
    fichierLabel = fichierEntrainement.replace(".txt", "label.txt")

    datas = txtFileToListe(fichierEntrainement, withSpaceTreatment=True)
    predictData = txtFileToListe(fichierPredict, withSpaceTreatment=True)
    labelDatas = txtFileToListe(fichierLabel, withSpaceTreatment=True)
    # First pourcentageTrain% of the lines train; the rest validate.
    nbrTrain = int((float(len(datas)) / 100) * pourcentageTrain)

    trainData = datas[:nbrTrain]
    trainLabel = labelDatas[:nbrTrain]
    testData = datas[nbrTrain:]
    testLabel = labelDatas[nbrTrain:]

    # Embed every document through the Doc2Vec model (wrapped in a
    # one-element list, matching the network's expected input structure).
    vectorTrain = [vectorFromDataList(trainData, docToVecModel)]
    vectorToPredict = [vectorFromDataList(predictData, docToVecModel)]
    vectorTest = [vectorFromDataList(testData, docToVecModel)]

    # One-hot encode labels; the 4 implies labels are in 0..3.
    trainLabel = np_utils.to_categorical(trainLabel, 4)
    testLabel = np_utils.to_categorical(testLabel, 4)

    print("Nombre de données d'entrainement : " + str(len(vectorTrain[0])))
    print("Nombre de données de teste : " + str(len(vectorTest[0])))
    metrics = ['accuracy'] 
    global BestModelFile

    if typeRNN == "FFW":
        print("Creation du model FFW")
        model = getFFWModel(shapeRNN, tailleVecteurDocToVec, activationType, metrics)
        # Metrics() presumably checkpoints the best epoch to this path
        # (read back below when selectBestModel is set) -- TODO confirm.
        BestModelFile = "GenerationFichier/Models/RNN/" + nameModel + ".hdf5"
        print("Entrainement du model")
        # Run the training
        callbackMetric = Metrics()
        model.fit(vectorTrain, trainLabel, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=[callbackMetric] ,validation_data=(vectorTest, testLabel))
        if selectBestModel:
            model = load_model(BestModelFile)
        score = model.evaluate(vectorTest, testLabel, verbose=0)
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])
        results = model.predict(vectorToPredict)
        # Pick the argmax class of each softmax row as the predicted label.
        labelResult = []
        for result in results:
            listResult = result.tolist()
            indexMax = listResult.index(max(listResult))
            labelResult.append(str(indexMax) + "\n")
        WriteInFile("GenerationFichier/RNNtestResult.txt", labelResult)

    elif typeRNN =="Committee":
        models = []
        print("Creation du commitée (model FFW)")
        # Train TailleDuComitee independent experts on the same data.
        for i in range(0,TailleDuComitee):
            model = getFFWModel(shapeRNN, tailleVecteurDocToVec, activationType, metrics)
            if selectBestModel:
                BestModelFile = "GenerationFichier/Models/RNN/" + nameModel + str(i) + ".hdf5"
            callbackMetric = Metrics()
            model.fit(vectorTrain, trainLabel, batch_size=batch_size, epochs=epochs, verbose=0, callbacks=[callbackMetric] ,validation_data=(vectorTest, testLabel))
            if selectBestModel:
                models.append(load_model(BestModelFile))
            else:
                models.append(model)
            
            
            print(str((i+1)*(100/TailleDuComitee)) + "%")
        # Accumulate each expert's predictions, weighted by its test
        # accuracy, into one score vector per document.
        expertsResult = []
        TailleDuComitee = len(models)
        for numExpert in range(0,TailleDuComitee):
            score = models[numExpert].evaluate(vectorTest, testLabel, verbose=0)
            poids = score[1]
            resultExpert = models[numExpert].predict(vectorToPredict).tolist()
            for y in range(0,len(resultExpert)):
                if len(expertsResult) == y:
                    # First expert reaching this row initialises it.
                    expertsResult.append([i*poids for i in resultExpert[y]])
                else: 
                    for z in range(0, len(expertsResult[y])):
                        expertsResult[y][z] += (resultExpert[y][z] * poids)
        # The committee's label is the argmax of the weighted sums.
        labelResult = []
        for result in expertsResult:
            indexMax = result.index(max(result))
            labelResult.append(str(indexMax) + "\n")
        WriteInFile("GenerationFichier/RNNtestResult.txt", labelResult)

    else:
        raise ValueError("Le model n'est pas reconnu : vérifié la valeur du type de réseau neuronal demandé")