def Normalise_Json(TypeNormalisation="Simple"):
    """Normalise the annotated tweets stored in Data/tweetsAnnotate.json.

    Every entry of the JSON ``"tweets"`` list contributes two output lines:
    its ``"message"`` passed through ``normalisation`` with the requested
    ``TypeNormalisation``, and its ``"polarity"`` passed through
    ``normalisationSimple`` and then ``LabelConverter``. Messages are written
    to Normalisation/JSONdata.txt and labels to Normalisation/JSONdatalabel.txt,
    one newline-terminated entry per tweet, in the same order.

    Args:
        TypeNormalisation: normalisation mode forwarded to ``normalisation``.
    """
    print("Lancement de la normalisation de Data/tweetsAnnotate.json")
    with open("Data/tweetsAnnotate.json") as flux:
        contenu = json.load(flux)

    messages = []
    etiquettes = []
    for entree in contenu["tweets"]:
        # Message first, label second: same per-tweet call order as before.
        texte = normalisation(entree["message"],
                              TypeNormalisation=TypeNormalisation)
        messages.append(texte + "\n")
        polarite = normalisationSimple(entree["polarity"])
        etiquettes.append(str(LabelConverter(polarite)) + "\n")

    WriteInFile("Normalisation/JSONdata.txt", messages)
    WriteInFile("Normalisation/JSONdatalabel.txt", etiquettes)
def Normalise_DonneeTest(TypeNormalisation="Simple"):
    """Normalise Data/donneeTest.txt into separate id and text files.

    Each source line is cut at fixed offsets: the first 18 characters are
    kept as the identifier, the character at index 18 is dropped (presumably
    a separator -- confirm against the data file), and the remainder is the
    text. The text is normalised, then re-emitted as its words each followed
    by a single space.

    Identifiers go to Normalisation/idsDT.txt and texts to
    Normalisation/texteDT.txt, one newline-terminated entry per source line.

    Args:
        TypeNormalisation: normalisation mode forwarded to ``normalisation``.
    """
    fileName = "Data/donneeTest.txt"
    print("Lancement de la normalisation de " + fileName)

    identifiants = []
    textes = []
    for brut in txtFileToListe(fileName):
        contenu = brut[19:].replace('\n', '')
        identifiant = brut[:18] + "\n"
        normalise = normalisation(contenu, TypeNormalisation)
        # Every word keeps a trailing space, including the last one.
        reconstruit = "".join(mot + " " for mot in normalise.split())
        textes.append(reconstruit + "\n")
        identifiants.append(identifiant)

    WriteInFile("Normalisation/idsDT.txt", identifiants)
    WriteInFile("Normalisation/texteDT.txt", textes)
def _normalise_fichier_deft(fileName, TypeNormalisation):
    """Normalise one tab-separated DEFT 2017 file.

    The second column of each line is normalised as the message and the
    third column is converted with ``LabelConverter`` (after stripping
    ``\\n`` and ``\\r``). Lines with fewer than three columns are skipped.

    Args:
        fileName: path of the tab-separated source file.
        TypeNormalisation: normalisation mode forwarded to ``normalisation``.

    Returns:
        A ``(lignes, labels)`` pair of lists of newline-terminated strings,
        aligned index by index.
    """
    print("Lancement de la normalisation de " + fileName)
    lignes = []
    labels = []
    with open(fileName) as file:
        for line in file:
            data = line.split('\t')
            # Both data[1] and data[2] are read below, so three columns are
            # required; a two-column line used to raise IndexError.
            if len(data) < 3:
                continue
            message = normalisation(data[1],
                                    TypeNormalisation=TypeNormalisation)
            labelStr = data[2].replace('\n', '').replace('\r', '')
            label = str(LabelConverter(labelStr)) + "\n"
            # Every word keeps a trailing space, including the last one.
            lignes.append("".join(mot + " " for mot in message.split()) + "\n")
            labels.append(label)
    return lignes, labels


def Normalise_DataDeft2017(TypeNormalisation="Simple"):
    """Normalise the DEFT 2017 task-1 train and gold-test files.

    The train file is written to Normalisation/SVMtrain.txt and
    Normalisation/SVMtrainlabel.txt, the gold-test file to
    Normalisation/SVMtest.txt and Normalisation/SVMtestlabel.txt.

    Both files are normalised with the same ``TypeNormalisation``. The test
    file was previously always normalised with ``normalisation``'s default
    mode, which made train and test inconsistent for any other mode.

    Args:
        TypeNormalisation: normalisation mode forwarded to ``normalisation``.
    """
    jeux = (
        ("Data/data_deft2017/task1-train.csv", "Normalisation/SVMtrain"),
        ("Data/data_deft2017/task1-testGold.csv", "Normalisation/SVMtest"),
    )
    for fileName, prefixeSortie in jeux:
        lignes, labels = _normalise_fichier_deft(fileName, TypeNormalisation)
        WriteInFile(prefixeSortie + ".txt", lignes)
        WriteInFile(prefixeSortie + "label.txt", labels)
def ChargementMotsGrossier():
    """Load the swear-word lists into the global ``MotsGrossiers`` mapping.

    Words are read from the two files under Data/MotsGrossiers/, passed
    through ``normalisationSimple``, and kept only when the normalised form
    is exactly one whitespace-separated token. Each kept word becomes a key
    of ``MotsGrossiers`` mapped to a fresh empty list (an existing entry for
    the same key is overwritten). All keys of ``MotsGrossiers`` are then
    written, one per line, to Normalisation/MG.txt.
    """
    global MotsGrossiers

    motsBruts = txtFileToListe("Data/MotsGrossiers/MGFromWikipedia.txt")
    motsBruts += txtFileToListe("Data/MotsGrossiers/MGSupplementaire.txt")

    for brut in motsBruts:
        cle = normalisationSimple(brut)
        if len(cle.split()) != 1:
            # Multi-word (or empty) expressions are not indexed.
            continue
        MotsGrossiers[cle] = []

    sortie = [cle + "\n" for cle in MotsGrossiers]
    WriteInFile("Normalisation/MG.txt", sortie)
def Normalise_Unlabeled(TypeNormalisation="Simple"):
    """Normalise the unlabeled tweets of Data/unlabeled.xml, without duplicates.

    The ``message`` child of every child of the ``/root`` element is
    normalised. Normalised messages are collected as keys of a dict so that
    each distinct message is kept once, and the dict is handed to
    ``WriteInFile`` for Normalisation/70kTweet.txt.

    A progress line is printed every 7200 tweets (presumably 10% of the
    corpus, given the output file name -- confirm).

    Args:
        TypeNormalisation: normalisation mode forwarded to ``normalisation``.
    """
    fileName = "Data/unlabeled.xml"
    print("Lancement de la normalisation de " + fileName)
    tree = etree.parse(fileName)
    root = tree.xpath("/root")[0]

    lignesDic = {}
    # Iterating the element yields its direct children; this replaces the
    # deprecated getchildren().
    for nbTweet, tweet in enumerate(root):
        if nbTweet % 7200 == 0:
            # Floor division keeps the counter an int on Python 3 as well
            # ("10% ..." rather than "1.00% ...").
            print(str((nbTweet // 7200) + 1) + "0% Effectuer")
        elemMessage = tweet.find("message")
        message = normalisation(elemMessage.text, TypeNormalisation) + "\n"
        # dict.has_key() no longer exists on Python 3; setdefault leaves an
        # already-seen message untouched.
        lignesDic.setdefault(message, 0)

    WriteInFile("Normalisation/70kTweet.txt", lignesDic)
def EnleverMotVideSpecifique(fichier):
    """Strip corpus-specific stop words from Normalisation/<fichier>.txt in place.

    All words of the file are fed to an ``IndexBuilder``. A word is treated
    as a corpus-specific stop word when the count reported for it by
    ``GetNombreOccurence`` is greater than 10% of the number of lines in the
    file. The file is then rewritten with those words removed; every kept
    word is followed by a space and every line ends with a newline. The
    stop-word mapping is printed before the file is written.

    Args:
        fichier: base name (without directory or ``.txt``) of the file under
            Normalisation/ to filter.
    """
    pathFile = "Normalisation/" + fichier + ".txt"
    index = IndexBuilder()
    lignes = txtFileToListe(pathFile, withSpaceTreatment=True)
    for ligne in lignes:
        for mot in ligne.split():
            index.AddElem(mot)

    nbrOccurences = index.GetNombreOccurence()
    # Threshold: 10% of the number of lines.
    pc10 = int(float(len(lignes)) / 100 * 10)
    motVideSpecifique = {}
    for nbrOccurence in nbrOccurences:
        occurence = nbrOccurence[0]
        nbr = nbrOccurence[1][0]
        if nbr > pc10:
            motVideSpecifique[occurence] = 0

    # Collect fragments and join once instead of growing a string with +=;
    # `not in` replaces dict.has_key(), which does not exist on Python 3.
    morceaux = []
    for ligne in lignes:
        morceaux.extend(mot + " " for mot in ligne.split()
                        if mot not in motVideSpecifique)
        morceaux.append("\n")
    res = "".join(morceaux)

    print(motVideSpecifique)
    WriteInFile(pathFile, res)
def Normalise_PolariteMots():
    """Extract single-word polarity entries from the JeuxDeMots polarity dump.

    Lines containing ``//`` and lines consisting only of a newline are
    ignored. Every other line is split on double quotes: the second field is
    converted with ``ISO8859Converter`` and normalised with
    ``normalisationSimple``; when the result is a single word, that word
    (spaces removed) concatenated with the third field is kept. The kept
    entries are written to Normalisation/Polarite.txt.

    Lines that do not split into exactly three fields are printed as a
    diagnostic. Those with fewer than three fields are then skipped, since
    the fields needed below do not exist.
    """
    fileName = "Data/06032019-POLARITY-JEUXDEMOTS-FR.txt"
    print("Lancement de la normalisation de " + fileName)
    listePolarite = []
    # 1113399 is presumably the line count of the source file (TODO confirm);
    # a tenth of it drives the progress messages.
    nbrLigne10pc = 1113399 // 10
    with open(fileName) as file:
        for nbrLigne, line in enumerate(file):
            if nbrLigne % nbrLigne10pc == 0:
                # Floor division keeps the counter an int on Python 3 too.
                print(str(nbrLigne // nbrLigne10pc) + "0% Effectué")
            if "//" in line or line == "\n":
                continue
            data = line.split('"')
            if len(data) != 3:
                print(data)
                if len(data) < 3:
                    # data[1] / data[2] are missing: previously IndexError.
                    continue
            text = normalisationSimple(ISO8859Converter(data[1]))
            if len(text.split()) == 1:
                listePolarite.append(text.replace(' ', '') + data[2])
    WriteInFile("Normalisation/Polarite.txt", listePolarite)
Beispiel #8
0
def RNNscript(fichierEntrainement, predictFile, pourcentageTrain,
              shapeRNN=[200,150,100], nameModel="RNNmodel", typeRNN="FFW",
              TailleDuComitee=0, activationType='relu', batch_size=32,
              epochs=10, SelectBestModel=False, FMaxValue=0):
    """Train a feed-forward classifier (or a committee of them) and label a file.

    Steps, in order:
      1. Build a Doc2Vec model with ``GetModelFromDocToVec`` from module-level
         settings and save it under GenerationFichier/Models/DocToVec/.
      2. Load Normalisation/<fichierEntrainement>.txt, its ``label.txt``
         companion and Normalisation/<predictFile>.txt; the first
         ``pourcentageTrain`` percent of the training lines are used for
         training, the rest for validation/evaluation.
      3. Vectorise the three sets with ``vectorFromDataList`` and one-hot
         encode the labels over 4 classes.
      4. ``typeRNN == "FFW"``: train one model, evaluate it, predict.
         ``typeRNN == "Committee"``: train ``TailleDuComitee`` models and sum
         their predictions, each weighted by ``score[1]`` from ``evaluate``
         (accuracy, given ``metrics=['accuracy']``).
      5. Write the arg-max class of every prediction, one per line, to
         GenerationFichier/RNNtestResult.txt.

    The globals ``selectBestModel`` and ``BestModelFile`` are assigned before
    each ``fit`` call. When ``selectBestModel`` is true, the model is
    reloaded from ``BestModelFile`` after training (presumably written by the
    ``Metrics`` callback -- confirm).

    Args:
        fichierEntrainement: base name of the training file in Normalisation/.
        predictFile: base name of the file to label in Normalisation/.
        pourcentageTrain: percentage of training lines used for fitting.
        shapeRNN: layer sizes forwarded to ``getFFWModel``.
        nameModel: base name used for the saved model files.
        typeRNN: ``"FFW"`` or ``"Committee"``.
        TailleDuComitee: number of committee members (required for Committee).
        activationType: activation forwarded to ``getFFWModel``.
        batch_size: batch size forwarded to ``fit``.
        epochs: epoch count forwarded to ``fit``.
        SelectBestModel: value stored in the global ``selectBestModel``.
        FMaxValue: not used by this function.

    Raises:
        ValueError: if ``typeRNN`` is neither ``"FFW"`` nor ``"Committee"``
            (raised only after the data has been loaded and vectorised).
    """
    global listesFichiersEntrainement, tailleVecteurDocToVec, pcTrain, epochsDocToVec, testingDocToVec, selectBestModel, LastResult
    global BestModelFile

    def etiquette(scores):
        # Index of the first maximal score, as a newline-terminated string.
        return str(scores.index(max(scores))) + "\n"

    selectBestModel = SelectBestModel
    if typeRNN == "Committee" and TailleDuComitee == 0:
        print("Nombre d'expert non indiqué, veuillez renseigner le parametre TailleDuComitee ")
        quit()

    docToVecModel = GetModelFromDocToVec(
        listesFichiersEntrainement, pcTrain, tailleVecteurDocToVec,
        epochsDocToVec, testing=testingDocToVec)
    docToVecModel.save(
        "GenerationFichier/Models/DocToVec/" + nameModel + "_DocToVec")
    print("Lancement de la création du RNN")

    cheminEntrainement = "Normalisation/" + fichierEntrainement + ".txt"
    cheminPredict = "Normalisation/" + predictFile + ".txt"
    cheminLabel = cheminEntrainement.replace(".txt", "label.txt")

    datas = txtFileToListe(cheminEntrainement, withSpaceTreatment=True)
    predictData = txtFileToListe(cheminPredict, withSpaceTreatment=True)
    labelDatas = txtFileToListe(cheminLabel, withSpaceTreatment=True)

    # Split point between the training part and the held-out part.
    coupure = int((float(len(datas)) / 100) * pourcentageTrain)
    trainData, testData = datas[:coupure], datas[coupure:]
    trainLabel, testLabel = labelDatas[:coupure], labelDatas[coupure:]

    # Vectorisation order (train, predict, test) is kept as in the original.
    vectorTrain = [vectorFromDataList(trainData, docToVecModel)]
    vectorToPredict = [vectorFromDataList(predictData, docToVecModel)]
    vectorTest = [vectorFromDataList(testData, docToVecModel)]

    trainLabel = np_utils.to_categorical(trainLabel, 4)
    testLabel = np_utils.to_categorical(testLabel, 4)

    print("Nombre de données d'entrainement : " + str(len(vectorTrain[0])))
    print("Nombre de données de teste : " + str(len(vectorTest[0])))
    metrics = ['accuracy']

    if typeRNN == "FFW":
        print("Creation du model FFW")
        model = getFFWModel(shapeRNN, tailleVecteurDocToVec, activationType,
                            metrics)
        BestModelFile = "GenerationFichier/Models/RNN/" + nameModel + ".hdf5"
        print("Entrainement du model")
        model.fit(vectorTrain, trainLabel, batch_size=batch_size,
                  epochs=epochs, verbose=1, callbacks=[Metrics()],
                  validation_data=(vectorTest, testLabel))
        if selectBestModel:
            model = load_model(BestModelFile)
        score = model.evaluate(vectorTest, testLabel, verbose=0)
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])
        predictions = model.predict(vectorToPredict)
        labelResult = [etiquette(ligne.tolist()) for ligne in predictions]
        WriteInFile("GenerationFichier/RNNtestResult.txt", labelResult)

    elif typeRNN == "Committee":
        experts = []
        print("Creation du commitée (model FFW)")
        for i in range(TailleDuComitee):
            model = getFFWModel(shapeRNN, tailleVecteurDocToVec,
                                activationType, metrics)
            if selectBestModel:
                BestModelFile = ("GenerationFichier/Models/RNN/" + nameModel
                                 + str(i) + ".hdf5")
            model.fit(vectorTrain, trainLabel, batch_size=batch_size,
                      epochs=epochs, verbose=0, callbacks=[Metrics()],
                      validation_data=(vectorTest, testLabel))
            if selectBestModel:
                experts.append(load_model(BestModelFile))
            else:
                experts.append(model)
            print(str((i+1)*(100/TailleDuComitee)) + "%")

        # Weighted sum of the experts' outputs: the first expert creates
        # each row, the following ones add into it.
        cumul = []
        for expert in experts:
            poids = expert.evaluate(vectorTest, testLabel, verbose=0)[1]
            sorties = expert.predict(vectorToPredict).tolist()
            for y, ligne in enumerate(sorties):
                if len(cumul) == y:
                    cumul.append([valeur * poids for valeur in ligne])
                else:
                    total = cumul[y]
                    for z in range(len(total)):
                        total[z] += (ligne[z] * poids)

        labelResult = [etiquette(total) for total in cumul]
        WriteInFile("GenerationFichier/RNNtestResult.txt", labelResult)

    else:
        raise ValueError("Le model n'est pas reconnu : vérifié la valeur du type de réseau neuronal demandé")