def data_preparation(self):
    """Split one of the Brown, BNC News or Indian corpora into train/test material.

    Reads ``self.corpus`` (and ``self.tagset`` / ``self.lang`` where relevant)
    to select the corpus.

    Returns:
    --------
    sentences (list): Sentences without POS-tags
    tagged_sentences (list): Sentences with POS-tags

    Raises:
    -------
    ValueError: if ``self.corpus``, ``self.tagset`` or ``self.lang`` is not
        one of the supported values.  (Previously these cases fell through
        and crashed with ``UnboundLocalError`` on the final ``return``.)
    """
    if self.corpus == 'brown':
        tagged_sentences = brown.tagged_sents(categories='news')
        sentences = brown.sents(categories='news')
    elif self.corpus == 'bnc':
        root = find('corpora/bnc')
        bncnews = TaggedCorpusReader(root, 'bnc-news-wtp.txt',
                                     tagset='en-claws')
        if self.tagset is None:
            tagged_sentences = bncnews.tagged_sents()
        elif self.tagset == 'universal':
            tagged_sentences = bncnews.tagged_sents(tagset=self.tagset)
        else:
            raise ValueError(f'Unsupported tagset: {self.tagset!r}')
        sentences = bncnews.sents()
    elif self.corpus == 'indian':
        if self.lang in ('telugu', 'hindi', 'marathi', 'bangla'):
            tagged_sentences = indian.tagged_sents(f'{self.lang}.pos')
            sentences = indian.sents(f'{self.lang}.pos')
        else:
            # Was: print a warning, then crash with NameError on the return.
            raise ValueError('Language not part of Indian Corpus.')
    else:
        raise ValueError(f'Unsupported corpus: {self.corpus!r}')
    return sentences, tagged_sentences
def analyse_ngram(tranche):
    """Evaluate the n-gram tagger on fold *tranche* and accumulate MI scores.

    Trains a tagger on ``resultats/corpus_entrainementN.txt`` and tags
    ``resultats/corpus_testN.txt`` (N = tranche + 1), then updates the
    per-signifier counters in the module-level ``scores_ngram`` list
    (``total_signifiant``, ``total_MI``, ``MI_reperes``, ``MI_corrects``).
    """
    # NOTE: the fileid is interpreted by NLTK as a regex; the previous
    # 'resultats\/...' form triggered an invalid-escape warning and '/'
    # needs no escaping, so a plain slash is equivalent.
    fold = str(tranche + 1)
    corpus_entrainement_tuple = TaggedCorpusReader(
        dossier_racine, 'resultats/corpus_entrainement' + fold + '.txt')
    corpus_test_tuple = TaggedCorpusReader(
        dossier_racine, 'resultats/corpus_test' + fold + '.txt')

    tagger = create_tagger(corpus_entrainement_tuple.tagged_sents())

    sents_corrects = corpus_test_tuple.tagged_sents()
    sents_tagges = tagger.tag_sents(corpus_test_tuple.sents())
    for sent_correct, sent_tagge in zip(sents_corrects, sents_tagges):
        # Pair each gold (word, tag) with the predicted (word, tag).
        for mot_correct, mot_tagge in zip(sent_correct, sent_tagge):
            for MI in scores_ngram:
                if MI['signifiant'] != mot_correct[0]:
                    continue
                MI['total_signifiant'] += 1
                if mot_correct[1] == 'M':
                    MI['total_MI'] += 1
                if mot_tagge[1] == 'M':
                    MI['MI_reperes'] += 1
                if mot_tagge[1] == mot_correct[1]:
                    MI['MI_corrects'] += 1
def _collect_svm_features(sents_corrects, sents_tagges):
    """Build feature dicts and binary labels for every tracked signifier.

    For each token whose surface form matches a ``'signifiant'`` entry of the
    module-level ``scores_SVM`` list, appends ``create_dict(sentence, index)``
    and a label (1 if the gold tag is 'M', else 0).

    Returns (feature_dicts, labels), two parallel lists.
    """
    feature_dicts = []
    labels = []
    for sent_correct, sent_tagge in zip(sents_corrects, sents_tagges):
        # Pair each gold (word, tag) with the predicted (word, tag).
        phrase_combine = list(zip(sent_correct, sent_tagge))
        for indice, (mot_correct, _mot_tagge) in enumerate(phrase_combine):
            for MI in scores_SVM:
                if mot_correct[0] == MI['signifiant']:
                    feature_dicts.append(create_dict(phrase_combine, indice))
                    labels.append(1 if mot_correct[1] == 'M' else 0)
    return feature_dicts, labels


def analyse_SVM(tranche):
    """Train and evaluate a linear SVM on fold *tranche*.

    Trains an n-gram tagger on ``resultats/corpus_entrainementN.txt``
    (N = tranche + 1), builds feature dictionaries for the training and
    test corpora, fits the SVM, and accumulates per-signifier results in
    the module-level ``scores_SVM`` list.
    """
    global scores_SVM

    # NOTE: the fileid is interpreted by NLTK as a regex; the previous
    # 'resultats\/...' form triggered an invalid-escape warning and '/'
    # needs no escaping, so a plain slash is equivalent.
    fold = str(tranche + 1)
    corpus_entrainement = TaggedCorpusReader(
        dossier_racine, 'resultats/corpus_entrainement' + fold + '.txt')
    tagger = create_tagger(corpus_entrainement.tagged_sents())

    # Training features: re-tag the training corpus itself so the tagger-based
    # feature comes from the same source as it will for the test corpus.
    liste_dictionnaires, liste_y = _collect_svm_features(
        corpus_entrainement.tagged_sents(),
        tagger.tag_sents(corpus_entrainement.sents()))

    corpus_test = TaggedCorpusReader(
        dossier_racine, 'resultats/corpus_test' + fold + '.txt')
    liste_dictionnaires_test, liste_y_test = _collect_svm_features(
        corpus_test.tagged_sents(),
        tagger.tag_sents(corpus_test.sents()))

    # Vectorize train + test together so both share a single feature space.
    vec = DictVectorizer()
    listes_colles = liste_dictionnaires + liste_dictionnaires_test
    vecteur_x_ent_plus_test = vec.fit_transform(listes_colles).toarray()
    vecteur_x_entrainement = vecteur_x_ent_plus_test[:len(liste_dictionnaires)]
    vecteur_x_test = vecteur_x_ent_plus_test[len(liste_dictionnaires):]

    # Hyperparameters kept from the original tuning: C=18 with class weight
    # {1: 3} gave the best balance (F-measure 0.9389 on 4185 signifiers).
    clf = svm.SVC(kernel='linear', C=18, class_weight={1: 3})
    print(clf.get_params())
    clf.fit(vecteur_x_entrainement, liste_y)

    prediction = clf.predict(vecteur_x_test)
    double_y = zip(liste_y_test, prediction)
    for unite, (reponse_correcte, reponse_predite) in zip(
            liste_dictionnaires_test, double_y):
        for M in scores_SVM:
            if M['signifiant'] != unite['signifiant']:
                continue
            M['total_signifiant'] += 1
            if reponse_correcte == 1:
                M['total_MI'] += 1
            if reponse_predite == 1:
                M['MI_reperes'] += 1
            if reponse_correcte == reponse_predite:
                M['MI_corrects'] += 1