def only_stems(keywords):
    """Stem every keyword with four stemmers and concatenate the results.

    Each keyword is UTF-8 encoded and handed to the Porter, RSLP, Orengo and
    Savoy stemmers. The result lists the Porter stems first, then the RSLP,
    Orengo and Savoy stems, each group in the order of ``keywords``.

    Args:
        keywords: Iterable of strings; each item must support
            ``.encode('utf8')``. One-shot iterables (generators) are
            accepted.

    Returns:
        list: One stem per keyword per stemmer, in the order described above.
    """
    # Encode once and materialise the input: the same encoded words feed all
    # four stemmers, and a generator argument is no longer exhausted by the
    # first pass (which left the remaining three stem lists empty).
    encoded = [word.encode('utf8') for word in keywords]

    # Instantiated in the same order as before; `orengo` replaces the old
    # local name `os`, which shadowed the standard-library module name.
    porter = PorterStemmer()
    orengo = OrengoStemmer()
    savoy = SavoyStemmer()
    rslp = RSLPStemmer()

    stems = [porter.getWordStem(word) for word in encoded]
    stems += [rslp.stem(word) for word in encoded]
    stems += [orengo.getWordStem(word) for word in encoded]
    stems += [savoy.getWordStem(word) for word in encoded]
    return stems
def stem(caller, word):
    """Return the stem of ``word`` for the language declared by ``caller``.

    The language is read from ``caller.lang`` and defaults to ``"en"`` when
    the attribute is missing.

    * ``"en"``: delegates to ``porter2.stem``.
    * ``"pt"``: delegates to an ``OrengoStemmer`` that is imported and
      created on first use and kept in the module-level ``_orengostemmer``.
    * anything else: ``word`` is returned unchanged.
    """
    global _orengostemmer

    language = getattr(caller, "lang", "en")

    if language == "en":
        return porter2.stem(word)

    if language != "pt":
        # No stemmer for this language: hand the word back untouched.
        return word

    if _orengostemmer is None:
        # Deferred import: ptstemmer is only loaded when Portuguese is used.
        from ptstemmer.implementations.OrengoStemmer import OrengoStemmer
        _orengostemmer = OrengoStemmer()
    return _orengostemmer.getWordStem(word)
# -*- coding: LATIN-1 -*-
'''
 * PTStemmer - A Stemming toolkit for the Portuguese language (C) 2008-2010 Pedro Oliveira
 *
 * This file is part of PTStemmer.
 * PTStemmer is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * PTStemmer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with PTStemmer. If not, see <http://www.gnu.org/licenses/>.
'''
from ptstemmer.implementations.OrengoStemmer import OrengoStemmer
from ptstemmer.implementations.SavoyStemmer import SavoyStemmer
from ptstemmer.implementations.PorterStemmer import PorterStemmer
from ptstemmer.support import PTStemmerUtilities

# Usage demo: stem two words and print the results.
if __name__ == '__main__':
    # PorterStemmer or SavoyStemmer can be used here instead.
    stemmer = OrengoStemmer()
    stemmer.enableCaching(1000)

    # NOTE(review): the empty path looks like a placeholder for a word-list
    # file whose entries should be ignored -- confirm before relying on it.
    ignored_words = PTStemmerUtilities.fileToSet("")
    stemmer.ignore(ignored_words)

    # First word: stem it, then strip the diacritics before printing.
    accented_stem = stemmer.getWordStem("ciências")
    plain_stem = PTStemmerUtilities.removeDiacritics(accented_stem)
    print(plain_stem)

    # Second word: print the stem as returned.
    adverb_stem = stemmer.getWordStem("extremamente")
    print(adverb_stem)
#!/usr/bin/env python
"""
 * PTStemmer - A Stemming toolkit for the Portuguese language (C) 2008-2010 Pedro Oliveira
 *
 * This file is part of PTStemmer.
 * PTStemmer is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * PTStemmer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with PTStemmer. If not, see <http://www.gnu.org/licenses/>.
"""
from ptstemmer.implementations.OrengoStemmer import OrengoStemmer
from ptstemmer.implementations.SavoyStemmer import SavoyStemmer
from ptstemmer.implementations.PorterStemmer import PorterStemmer

# Usage demo: stem one word with caching enabled and two ignored words.
if __name__ == "__main__":
    s = OrengoStemmer()
    # Alternative implementations with the same calls used below:
    # s = PorterStemmer()
    # s = SavoyStemmer()
    s.enableCaching(1000)
    s.ignore(["a", "e"])
    # print() call instead of the Python 2 print statement, which is a
    # SyntaxError on Python 3; with a single argument the output is the same.
    print(s.getWordStem("extremamente"))
#!/usr/bin/env python
'''
 * PTStemmer - A Stemming toolkit for the Portuguese language (C) 2008-2010 Pedro Oliveira
 *
 * This file is part of PTStemmer.
 * PTStemmer is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * PTStemmer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with PTStemmer. If not, see <http://www.gnu.org/licenses/>.
'''
from ptstemmer.implementations.OrengoStemmer import OrengoStemmer
from ptstemmer.implementations.SavoyStemmer import SavoyStemmer
from ptstemmer.implementations.PorterStemmer import PorterStemmer
from ptstemmer.support import PTStemmerUtilities

# Usage demo: stem two words and print the results.
if __name__ == '__main__':
    s = OrengoStemmer()  # or PorterStemmer or SavoyStemmer
    s.enableCaching(1000)
    # NOTE(review): the empty path looks like a placeholder for a word-list
    # file whose entries should be ignored -- confirm before relying on it.
    s.ignore(PTStemmerUtilities.fileToSet(""))
    stem = s.getWordStem("ciências")
    # print() calls instead of Python 2 print statements, which are a
    # SyntaxError on Python 3; with a single argument the output is the same.
    print(PTStemmerUtilities.removeDiacritics(stem))
    print(s.getWordStem("extremamente"))