# Assumes Pattern's parsers are imported in the surrounding module, e.g.:
#   from pattern.en import parse as parse_en
#   from pattern.es import parse as parse_es
def _extract_ngrams(self, text, lang):
    unigrams_lemmas = []
    pos_tagged = []
    if lang == 'es':
        sentences = parse_es(text, lemmata=True).split()
    else:
        sentences = parse_en(text, lemmata=True).split()
    for sentence in sentences:
        for token in sentence:
            # token = [word, POS, chunk, PNP, lemma] when lemmata=True
            if token[0].lower() not in self.stopwords:
                unigrams_lemmas.append(token[4])
                pos_tagged.append(token[1])
    return unigrams_lemmas, pos_tagged
# Extended variant that also keeps the surface words; note it reads
# self._stopwords rather than self.stopwords.
def _extract_ngrams(self, text, lang):
    unigrams_lemmas = []
    unigrams_words = []
    pos_tagged = []
    if lang == 'es':
        sentences = parse_es(text, lemmata=True).split()
    else:
        sentences = parse_en(text, lemmata=True).split()
    for sentence in sentences:
        for token in sentence:
            if token[0].lower() not in self._stopwords:
                unigrams_words.append(token[0].lower())
                unigrams_lemmas.append(token[4])
                pos_tagged.append(token[1])
    return unigrams_lemmas, unigrams_words, pos_tagged
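The indexing above (token[0] for the surface word, token[1] for the POS tag, token[4] for the lemma) follows the token layout Pattern produces when lemmata=True. A minimal sketch to illustrate that layout, using a made-up Spanish sentence:

from pattern.es import parse as parse_es

# Each token is [word, POS, chunk, PNP, lemma] when parsed with lemmata=True.
for sentence in parse_es("los gatos negros duermen", lemmata=True).split():
    for token in sentence:
        word, pos, lemma = token[0], token[1], token[4]
        print(word, pos, lemma)  # e.g. "gatos NNS gato"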
# Assumes Pattern's English/Spanish parsers and taggers are imported, e.g.:
#   from pattern.en import parse as parse_en, tag as tag_en, pprint as pprint_en
#   from pattern.es import parse as parse_es, tag as tag_es, pprint as pprint_es
def pos_realign(source, target):
    source = parse_en(source, relations=True, lemmata=True)
    target = parse_es(target, relations=True, lemmata=True)
    # pprint_en(source)
    # pprint_es(target)
    pos_en = tag_en(source.string)
    pos_es = tag_es(target.string)
    pos_realigned = []
    for e in pos_en:
        try:
            # Consume one target token per source token ...
            pos_es.pop(0)
        except IndexError:
            break
        # ... then pair the source word with the first remaining target word
        # that carries the same POS tag.
        word_in_target = [word for (word, pos) in pos_es if pos == e[1]]
        if word_in_target:
            pos_realigned.append((e[0], word_in_target[0]))
    source_out = [s for (s, t) in pos_realigned]
    target_out = [t for (s, t) in pos_realigned]
    return source_out, target_out
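The realignment above keys purely on matching POS tags between the English and Spanish taggers' outputs. A small sketch of what those tag() outputs look like, assuming pattern.en.tag and pattern.es.tag and a made-up sentence pair (the tags shown in the comments are indicative only):

from pattern.en import tag as tag_en
from pattern.es import tag as tag_es

print(tag_en("the black cat sleeps"))  # e.g. [('the', 'DT'), ('black', 'JJ'), ('cat', 'NN'), ('sleeps', 'VBZ')]
print(tag_es("el gato negro duerme"))  # e.g. [('el', 'DT'), ('gato', 'NN'), ('negro', 'JJ'), ('duerme', 'VB')]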
# coding: utf-8
import os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

# By default, parse() uses part-of-speech tags from the Penn Treebank tagset:
# http://www.clips.ua.ac.be/pages/penn-treebank-tagset
# It is a good idea to study the tagset and its abbreviations for a few minutes.
from pattern.en import parse as parse_en

print(parse_en("the black cats", chunks=False))  # the/DT black/JJ cats/NNS
print("")

# ... where DT = determiner, JJ = adjective, NN = noun.
# This is true for all languages that Pattern supports:
from pattern.de import parse as parse_de
from pattern.es import parse as parse_es
from pattern.fr import parse as parse_fr
from pattern.it import parse as parse_it
from pattern.nl import parse as parse_nl

print(parse_de("die schwarzen Katzen", chunks=False))  # die/DT schwarzen/JJ Katzen/NNS
print(parse_es("los gatos negros", chunks=False))      # los/DT gatos/NNS negros/JJ
print(parse_fr("les chats noirs", chunks=False))       # les/DT chats/NNS noirs/JJ
print(parse_it("i gatti neri", chunks=False))          # i/DT gatti/NNS neri/JJ
print(parse_nl("de zwarte katten", chunks=False))      # de/DT zwarte/JJ katten/NNS
print("")

# In some cases, this means the original tagset is mapped to Penn Treebank:
from __future__ import print_function
# coding: utf-8
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

# By default, parse() uses part-of-speech tags from the Penn Treebank tagset:
# http://www.clips.ua.ac.be/pages/penn-treebank-tagset
# It is a good idea to study the tagset and its abbreviations for a few minutes.
from pattern.en import parse as parse_en

print(parse_en("the black cats", chunks=False))  # the/DT black/JJ cats/NNS
print()

# ... where DT = determiner, JJ = adjective, NN = noun.
# This is true for all languages that Pattern supports:
from pattern.de import parse as parse_de
from pattern.es import parse as parse_es
from pattern.fr import parse as parse_fr
from pattern.it import parse as parse_it
from pattern.nl import parse as parse_nl

print(parse_de("die schwarzen Katzen", chunks=False))  # die/DT schwarzen/JJ Katzen/NNS
print(parse_es("los gatos negros", chunks=False))      # los/DT gatos/NNS negros/JJ
print(parse_fr("les chats noirs", chunks=False))       # les/DT chats/NNS noirs/JJ
print(parse_it("i gatti neri", chunks=False))          # i/DT gatti/NNS neri/JJ
print(parse_nl("de zwarte katten", chunks=False))      # de/DT zwarte/JJ katten/NNS
print()

# In some cases, this means the original tagset is mapped to Penn Treebank:
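The closing comment refers to Pattern mapping language-specific tagsets onto Penn Treebank. As a hedged sketch of what that mapping means, parse() in several languages also accepts a tagset argument to get the original tags back; the STTS and PAROLE constants used here are assumed to be exported by pattern.de and pattern.es:

# Assumption: pattern.de exports STTS, pattern.es exports PAROLE,
# and parse() accepts them via the tagset keyword.
from pattern.de import STTS
from pattern.es import PAROLE

print(parse_de("die schwarzen Katzen", chunks=False, tagset=STTS))  # original German STTS tags
print(parse_es("los gatos negros", chunks=False, tagset=PAROLE))    # original Spanish PAROLE tags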