Example #1
from pattern.en import parsetree  # assumption: pattern's English parser; pattern.fr exposes the same call

def X_phrases_extraction(x):
    """Return [chunk type, chunk text] pairs for every chunk in the parsed text."""
    resultat = []
    s = parsetree(x)
    for sentence in s:
        for chunk in sentence.chunks:
            # print(chunk.words)
            resultat.append([chunk.type, ' '.join([w.string for w in chunk.words])])
    return resultat
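A minimal usage sketch for the chunk extractor above; the sample sentence and the exact chunking shown in the comment are illustrative:

chunks = X_phrases_extraction("The quick brown fox jumps over the lazy dog.")
print(chunks)
# roughly: [['NP', 'The quick brown fox'], ['VP', 'jumps'], ['PP', 'over'], ['NP', 'the lazy dog']]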
Example #2
from nltk.stem import WordNetLemmatizer
from pattern.fr import parsetree  # assumption: parsetree here is the French one ("already configured for French")

def lemmatiseur(token):
    # langueDuTexte is a module-level language flag set elsewhere
    if langueDuTexte == 'english':
        lemmatizer = WordNetLemmatizer()
        lemma = lemmatizer.lemmatize(token)
    elif langueDuTexte == 'french':
        # parsetree is already configured for French
        lemma = parsetree(token, lemmata=True)
    else:
        lemma = token  # fallback (added) so lemma is always defined
    return lemma
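A hedged usage sketch: langueDuTexte is the module-level flag the function reads, the outputs in the comments are indicative, and the English branch needs the NLTK WordNet data to be installed:

langueDuTexte = 'english'
print(lemmatiseur('cats'))    # -> 'cat'

langueDuTexte = 'french'
print(lemmatiseur('chats'))   # -> a parsed Text object carrying the lemma, not a plain string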
Example #3
    def __new__(self, txt):
        """Return the chunk of the last non-stopword noun in txt, or False if none is found."""
        tree = parsetree(txt)
        if not len(tree):
            return False
        else:
            sentence = tree[0]

        # if not sentence.is_question:
        #     return False

        relevant_nouns = [n for n in sentence.nouns if n.string not in self.STOPWORDS]
        if len(relevant_nouns):
            return relevant_nouns[-1].chunk.string
        else:
            return False
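Because this __new__ returns a plain string (or False) rather than an instance, calling the class behaves like a keyword-extraction function. A self-contained sketch of that pattern, with a hypothetical class name and stopword set:

from pattern.en import parsetree  # assumption: pattern's English parser

class QuestionTopic(object):
    """Hypothetical host class: calling it yields the chunk of the last relevant noun."""
    STOPWORDS = {'name', 'kind', 'time'}   # hypothetical stopword set

    def __new__(cls, txt):
        tree = parsetree(txt)
        if not len(tree):
            return False
        sentence = tree[0]
        relevant_nouns = [n for n in sentence.nouns if n.string not in cls.STOPWORDS]
        if relevant_nouns:
            return relevant_nouns[-1].chunk.string
        return False

print(QuestionTopic("Where is the nearest train station?"))   # e.g. 'the nearest train station'
print(QuestionTopic("What time is it?"))                       # False: the only noun is a stopword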
Example #4
    def get_arrets(self):
        """
        Cette methode prend en parametre une chaine de caractere et retourne, s'il y en a, le nom des arrets 
        """
        # initialisation
        stop_found = []
        irrelevent = ['GARE', 'SAINT', 'SAINTE']
        accepted_tags = ['NN', 'NNP', 'NNS']
        stop = self.Lex_learn['arrets']
        tax = []

        # learn the lexicon of stop names (each lexicon line is "name|alias1,alias2,...")
        for l in stop:
            l_split = l.split('|')
            tax.append(l_split[0])
            if len(l_split) > 1:
                tax.extend(l_split[1].split(','))
        # register every name and alias in pattern's shared taxonomy under the type 'ARRET'
        for a in tax:
            a = suppr_ret_char(a)
            taxonomy.append(a, type='ARRET')

        # search the parsed tweet (self.s) for words registered under the 'ARRET' type
        t = parsetree(self.s)
        matches = search('ARRET', t)
        for m in matches:
            for w in m.words:
                if (w.tag in accepted_tags and len(w.string) > 2
                        and w.string.upper() not in irrelevent):
                    stop_found.append(w.string)
                elif self._is_in_lev(w.string.upper(), self.Lex_arret):
                    stop_found.append(w.string)

        # look for compound stop names (neighbouring found words merged into one name)
        # not yet functional
        to_remove = []
        compound_found = []
        for i in range(0, len(stop_found)):
            for j in range(i, len(stop_found)):
                if self._is_neighbour(stop_found[i], stop_found[j]):
                    w_compound = stop_found[i] + " " + stop_found[j]
                    compound_found.append(w_compound)
                    to_remove.append(stop_found[i])
                    to_remove.append(stop_found[j])

        stop_found.extend(compound_found)
        to_remove = list(set(to_remove))

        for w in to_remove:
            stop_found.remove(w)

        # translate the found stops into real stop names
        for i in range(0, len(stop_found)):
            stop_found[i] = self._to_tbc_arret(stop_found[i])[0]

        # remove inconsistent stops ('AAAA' may or may not be present)
        try:
            stop_found.remove('AAAA')
        except ValueError:
            pass
        return list(set(stop_found))
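The lookup above relies on pattern's shared taxonomy combined with search(): names appended to the taxonomy under a type can then be matched in a parse tree by searching for that type. A minimal, self-contained sketch of that mechanism (the stop names and tweet are made up, and pattern.fr is an assumption):

from pattern.fr import parsetree
from pattern.search import search, taxonomy

# register a few stop names under the custom type 'ARRET'
for name in ('Gambetta', 'Victoire', 'Quinconces'):   # hypothetical stop names
    taxonomy.append(name, type='ARRET')

tweet = "Bus bloqué entre Gambetta et Quinconces ce matin"   # hypothetical tweet
matches = search('ARRET', parsetree(tweet))
print([m.string for m in matches])   # -> ['Gambetta', 'Quinconces']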
Example #5
# Excerpt from a longer Python 2 script written against an older gensim API
# (model.vocab, model.most_similar); sourcefile, w2vfile, focus1, focus2 and
# fix_word() are defined elsewhere in that script. Imports it appears to need:
import gensim
import treetaggerwrapper
import aspell
from pattern.fr import parsetree  # assumption: French parser, matching TAGLANG='fr' below

# pprint(pd)

# pos_pattern = { 'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'RB', 'RBR', 'RBS', 'VB', 'NNP', 'NNPS' }
pos_tt = {
    'ABR', 'ADJ', 'ADV', 'NOM', 'VER:cond', 'VER:futu', 'VER:impe', 'VER:impf',
    'VER:infi', 'VER:pper', 'VER:ppre', 'VER:pres', 'VER:simp', 'VER:subi',
    'VER:subp'
}

sourcetext = open(sourcefile, 'r').read().decode('utf-8')
model = gensim.models.Word2Vec.load(w2vfile)

tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
s = aspell.Speller('lang', 'fr')

text = parsetree(sourcetext, relations=True, lemmata=True)

for sentence in text:
    newsentence = list()
    #	print sentence.string.encode('utf-8')
    #	print('-------')
    parsed = tagger.tag_text(sentence.string)
    for word in parsed:
        [mot, pos, lemma] = fix_word(word.split('\t'))
        if pos in pos_tt and model.vocab.get(lemma):
            string = mot
            # word2vec analogy: candidates close to lemma, shifted from focus1 towards focus2
            swap = model.most_similar(positive=[focus2, lemma],
                                      negative=[focus1],
                                      topn=10)
            while len(swap) > 0:  # go through the top 10 matches and find the
                # first one that (1) has a pos (2) that is valid and (3) is a real