def data_preparation(self):
    """
    Split one of the Brown, BNC News, or Indian corpora into sentences
    with and without POS-tags.

    The corpus is selected by ``self.corpus`` ('brown', 'bnc', or
    'indian'); ``self.tagset`` selects the BNC tag mapping and
    ``self.lang`` the Indian-corpus language.

    Returns:
    --------
        sentences (list):
            Sentences without POS-tags
        tagged_sentences (list):
            Sentences with POS-tags

    Raises:
    -------
        ValueError:
            If ``self.corpus``, ``self.tagset``, or ``self.lang`` is not
            supported.  (Previously these cases fell through to the
            final ``return`` and crashed with UnboundLocalError.)
    """
    if self.corpus == 'brown':
        tagged_sentences = brown.tagged_sents(categories='news')
        sentences = brown.sents(categories='news')
    elif self.corpus == 'bnc':
        root = find('corpora/bnc')
        bncnews = TaggedCorpusReader(root,
                                     'bnc-news-wtp.txt',
                                     tagset='en-claws')
        if self.tagset is None:
            tagged_sentences = bncnews.tagged_sents()
        elif self.tagset == 'universal':
            tagged_sentences = bncnews.tagged_sents(tagset=self.tagset)
        else:
            # Any other tagset previously left tagged_sentences unbound.
            raise ValueError(f'Unsupported tagset: {self.tagset!r}')
        sentences = bncnews.sents()
    elif self.corpus == 'indian':
        if self.lang in ['telugu', 'hindi', 'marathi', 'bangla']:
            tagged_sentences = indian.tagged_sents(f'{self.lang}.pos')
            sentences = indian.sents(f'{self.lang}.pos')
        else:
            # Previously only printed and then crashed on the return.
            raise ValueError('Language not part of Indian Corpus.')
    else:
        raise ValueError(f'Unsupported corpus: {self.corpus!r}')
    return sentences, tagged_sentences
# Beispiel #2
def analyse_ngram(tranche):
    """
    Evaluate the n-gram tagger on cross-validation slice *tranche* and
    accumulate per-signifier counts into the global ``scores_ngram``.

    For every token whose surface form matches an entry's 'signifiant',
    the entry's counters are updated:
      total_signifiant  - occurrences of the signifier in the gold data
      total_MI          - gold occurrences tagged 'M'
      MI_reperes        - occurrences the tagger labelled 'M'
      MI_corrects       - 'M' predictions that match the gold tag

    Parameters:
    -----------
        tranche (int):
            Zero-based slice index; files are numbered from 1.
    """
    # NOTE: the original fileids used the invalid escape 'resultats\/...',
    # which left a literal backslash in the path and breaks on POSIX.
    # NLTK fileids always use forward slashes.
    train_reader = TaggedCorpusReader(
        dossier_racine, f'resultats/corpus_entrainement{tranche + 1}.txt')
    test_reader = TaggedCorpusReader(
        dossier_racine, f'resultats/corpus_test{tranche + 1}.txt')

    tagger = create_tagger(train_reader.tagged_sents())

    gold_sents = test_reader.tagged_sents()
    pred_sents = tagger.tag_sents(test_reader.sents())

    for gold_sent, pred_sent in zip(gold_sents, pred_sents):
        for gold_tok, pred_tok in zip(gold_sent, pred_sent):
            # gold_tok / pred_tok are (word, tag) pairs.
            for entry in scores_ngram:
                if entry['signifiant'] != gold_tok[0]:
                    continue
                entry['total_signifiant'] += 1
                if gold_tok[1] == 'M':
                    entry['total_MI'] += 1
                if pred_tok[1] == 'M':
                    entry['MI_reperes'] += 1
                    if pred_tok[1] == gold_tok[1]:
                        entry['MI_corrects'] += 1
# Beispiel #3
def _build_features(reader, tagger):
    """Build (feature-dict list, label list) for every token of *reader*
    whose surface form matches an entry of the global ``scores_SVM``.

    Labels are 1 when the gold tag is 'M', else 0.  A token matching
    several entries is appended once per match, exactly as the original
    duplicated loops did.
    """
    dicts = []
    labels = []
    gold_sents = reader.tagged_sents()
    pred_sents = tagger.tag_sents(reader.sents())
    for gold_sent, pred_sent in zip(gold_sents, pred_sents):
        # Pair each gold (word, tag) with the tagger's (word, tag).
        phrase_combine = list(zip(gold_sent, pred_sent))
        for indice, couple in enumerate(phrase_combine):
            for entry in scores_SVM:
                if couple[0][0] == entry['signifiant']:
                    dicts.append(create_dict(phrase_combine, indice))
                    labels.append(1 if couple[0][1] == 'M' else 0)
    return dicts, labels


def analyse_SVM(tranche):
    """
    Train a linear SVM on cross-validation slice *tranche* and score its
    predictions, accumulating per-signifier counts into the global
    ``scores_SVM``.

    Steps:
      1. Train the n-gram tagger on the training slice.
      2. Extract feature dicts (via ``create_dict``) for the training and
         test slices.
      3. Vectorize train+test together so both share one feature space,
         then split the matrix back apart.
      4. Fit ``svm.SVC`` on the training vectors and predict the test set.
      5. Update each matching ``scores_SVM`` entry's counters
         (total_signifiant / total_MI / MI_reperes / MI_corrects).

    Parameters:
    -----------
        tranche (int):
            Zero-based slice index; files are numbered from 1.
    """
    global scores_SVM

    # NOTE: the original fileids used the invalid escape 'resultats\/...',
    # leaving a literal backslash in the path; fileids use forward slashes.
    nom_entrainement = f'resultats/corpus_entrainement{tranche + 1}.txt'
    nom_test = f'resultats/corpus_test{tranche + 1}.txt'

    train_reader = TaggedCorpusReader(dossier_racine, nom_entrainement)
    tagger = create_tagger(train_reader.tagged_sents())

    # Training features come from a re-opened reader on the same training
    # slice, matching the original code.
    liste_dictionnaires, liste_y = _build_features(
        TaggedCorpusReader(dossier_racine, nom_entrainement), tagger)

    test_reader = TaggedCorpusReader(dossier_racine, nom_test)
    liste_dictionnaires_test, liste_y_test = _build_features(
        test_reader, tagger)

    # Vectorize train and test dicts together so both share one feature
    # space, then split the dense matrix back apart.
    vec = DictVectorizer()
    tous_les_dicts = liste_dictionnaires + liste_dictionnaires_test
    matrice = vec.fit_transform(tous_les_dicts).toarray()
    n_train = len(liste_dictionnaires)
    vecteur_x_entrainement = matrice[:n_train]
    vecteur_x_test = matrice[n_train:]

    # Hyper-parameters kept from the original tuning runs
    # (0.9211 / 0.9574; 4185 signifiers, F-measure 0.9389).
    clf = svm.SVC(kernel='linear', C=18, class_weight={1: 3})
    print(clf.get_params())
    clf.fit(vecteur_x_entrainement, liste_y)
    prediction = clf.predict(vecteur_x_test)

    # Accumulate counts per signifier.  Assumes create_dict puts a
    # 'signifiant' key in each feature dict — confirmed by the original
    # lookup below.
    for unite, (etiquette_or, etiquette_pred) in zip(
            liste_dictionnaires_test, zip(liste_y_test, prediction)):
        for entry in scores_SVM:
            if entry['signifiant'] != unite['signifiant']:
                continue
            entry['total_signifiant'] += 1
            if etiquette_or == 1:
                entry['total_MI'] += 1
            if etiquette_pred == 1:
                entry['MI_reperes'] += 1
                if etiquette_or == etiquette_pred:
                    entry['MI_corrects'] += 1