def X_phrases_extraction(x):
    resultat = []
    s = parsetree(x)
    for sentence in s:
        for chunk in sentence.chunks:
            # print(chunk.words)
            resultat.append([chunk.type, ' '.join([w.string for w in chunk.words])])
    return resultat
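# Usage sketch for X_phrases_extraction (not part of the original snippet).
# Assumption: parsetree comes from pattern.en; the original import is not shown.
from pattern.en import parsetree

for chunk_type, phrase in X_phrases_extraction('The black cat sleeps on the sofa.'):
    # each entry pairs a chunk tag (NP, VP, PP, ...) with the chunk's surface text,
    # e.g. ['NP', 'The black cat']
    print(chunk_type + ': ' + phrase)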
def lemmatiseur(token):
    if langueDuTexte == 'english':
        lemmatizer = WordNetLemmatizer()
        lemma = lemmatizer.lemmatize(token)
    elif langueDuTexte == 'french':
        # parsetree is already configured for French; take the lemma of the parsed
        # word so this branch returns a string, like the English branch
        lemma = parsetree(token, lemmata=True)[0].words[0].lemma
    else:
        # fall back to the raw token for unsupported languages
        lemma = token
    return lemma
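# Usage sketch for lemmatiseur. Assumptions: langueDuTexte is a module-level flag
# set elsewhere, WordNetLemmatizer comes from nltk.stem and parsetree from
# pattern.fr (neither import is shown in the original snippet).
from nltk.stem import WordNetLemmatizer
from pattern.fr import parsetree

langueDuTexte = 'french'
print(lemmatiseur('chats'))   # expected to print the lemma 'chat'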
def __new__(self, txt):
    tree = parsetree(txt)
    if not len(tree):
        return False
    else:
        sentence = tree[0]
        # if not sentence.is_question:
        #     return False
        relevant_nouns = [n for n in sentence.nouns if n.string not in self.STOPWORDS]
        if len(relevant_nouns):
            return relevant_nouns[-1].chunk.string
        else:
            return False
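# Illustrative, self-contained variant of the idea in __new__ above: return the
# chunk around the last noun that is not a stopword. The function name, the
# STOPWORDS value and the example sentence are assumptions, not taken from the
# original class.
from pattern.en import parsetree

STOPWORDS = {'anything', 'something', 'thing'}

def last_noun_chunk(txt):
    tree = parsetree(txt)
    if not len(tree):
        return False
    sentence = tree[0]
    nouns = [w for w in sentence.words
             if w.tag.startswith('NN') and w.string not in STOPWORDS]
    return nouns[-1].chunk.string if nouns else False

# last_noun_chunk('Where can I find a good pizza place?')
# would return the chunk containing 'place', e.g. 'a good pizza place'.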
def get_arrets(self):
    """
    This method takes a string and returns the names of the stops
    ("arrets") it contains, if any.
    """
    # initialisation
    stop_found = []
    irrelevent = ['GARE', 'SAINT', 'SAINTE']
    accepted_tags = ['NN', 'NNP', 'NNS']
    stop = self.Lex_learn['arrets']
    tax = []
    # learn the lexicon of stops; entries look like 'name' or 'name|alias1,alias2'
    for l in stop:
        l_split = l.split('|')
        tax.append(l_split[0])
        if len(l_split[0]) != len(l):
            # the entry also carries aliases after the '|'
            tax.extend(l_split[1].split(','))
    for a in tax:
        a = suppr_ret_char(a)
        taxonomy.append(a, type='ARRET')
    # look for keywords in the tweet (self.s)
    s = self.s
    t = parsetree(s)
    s = search('ARRET', t)
    stop_found = []
    for m in s:
        for w in m.words:
            if w.tag in accepted_tags and len(w.string) > 2 and not w.string.upper() in irrelevent:
                stop_found.append(w.string)
            elif self._is_in_lev(w.string.upper(), self.Lex_arret):
                stop_found.append(w.string)
    # look for compound stop names
    # not yet functional
    to_remove = []
    compound_found = []
    for i in range(0, len(stop_found)):
        for j in range(i, len(stop_found)):
            if self._is_neighbour(stop_found[i], stop_found[j]):
                w_compound = stop_found[i] + " " + stop_found[j]
                compound_found.append(w_compound)
                to_remove.append(stop_found[i])
                to_remove.append(stop_found[j])
    stop_found.extend(compound_found)
    to_remove = list(set(to_remove))
    for w in to_remove:
        stop_found.remove(w)
    # translate the stops that were found into actual stop names
    for i in range(0, len(stop_found)):
        stop_found[i] = self._to_tbc_arret(stop_found[i])[0]
    # remove non-coherent stops
    if 'AAAA' in stop_found:
        stop_found.remove('AAAA')
    return list(set(stop_found))
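# Minimal sketch of the taxonomy + search mechanism used above (pattern.search).
# The stop names, the example tweet and the pattern.fr import are illustrative
# assumptions, not the project's real lexicon.
from pattern.fr import parsetree
from pattern.search import search, taxonomy

taxonomy.append('quinconces', type='ARRET')
taxonomy.append('victoire', type='ARRET')

t = parsetree('le tram est bloque a quinconces')
for m in search('ARRET', t):
    # each match wraps the word(s) recognised as members of the ARRET category
    print(' '.join(w.string for w in m.words))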
# pprint(pd)
# pos_pattern = {'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'RB', 'RBR', 'RBS', 'VB', 'NNP', 'NNPS'}
# French TreeTagger POS tags that are allowed to be swapped
pos_tt = {
    'ABR', 'ADJ', 'ADV', 'NOM',
    'VER:cond', 'VER:futu', 'VER:impe', 'VER:impf', 'VER:infi',
    'VER:pper', 'VER:ppre', 'VER:pres', 'VER:simp', 'VER:subi', 'VER:subp'
}

sourcetext = open(sourcefile, 'r').read().decode('utf-8')
model = gensim.models.Word2Vec.load(w2vfile)
tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
s = aspell.Speller('lang', 'fr')

text = parsetree(sourcetext, relations=True, lemmata=True)
for sentence in text:
    newsentence = list()
    # print sentence.string.encode('utf-8')
    # print('-------')
    parsed = tagger.tag_text(sentence.string)
    for word in parsed:
        # TreeTagger returns 'token\tpos\tlemma' lines
        [mot, pos, lemma] = fix_word(word.split('\t'))
        if pos in pos_tt and model.vocab.get(lemma):
            string = mot
            swap = model.most_similar(positive=[focus2, lemma], negative=[focus1], topn=10)
            while len(swap) > 0:
                # go through the top 10 matches and find the
                # first one that (1) has a pos (2) that is valid and (3) is a real
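# Minimal sketch of the analogy-based swap used above, shown with illustrative
# French words in place of focus1/focus2 (which are defined elsewhere in the
# original script). Uses the same pre-1.0 gensim API as the snippet above, and
# assumes the words exist in the model's vocabulary.
candidates = model.most_similar(positive=['roi', 'femme'], negative=['homme'], topn=10)
for mot_candidat, similarite in candidates:
    # each candidate is a (token, cosine similarity) pair, best match first;
    # the loop above keeps the first one whose POS is valid
    print('%s %.3f' % (mot_candidat, similarite))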