Example #1
    # Assumes: from pattern.en import parse as parse_en
    #          from pattern.es import parse as parse_es
    def _extract_ngrams(self, text, lang):
        """Return the lemmas and POS tags of the non-stopword tokens in text."""
        unigrams_lemmas = []
        pos_tagged = []
        # parse(..., lemmata=True).split() yields a list of sentences,
        # each a list of [word, POS, chunk, PNP, lemma] tokens.
        if lang == 'es':
            sentences = parse_es(text, lemmata=True).split()
        else:
            sentences = parse_en(text, lemmata=True).split()

        for sentence in sentences:
            for token in sentence:
                if token[0].lower() not in self.stopwords:
                    unigrams_lemmas.append(token[4])  # lemma
                    pos_tagged.append(token[1])       # POS tag

        return unigrams_lemmas, pos_tagged
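
For context, parse_es(text, lemmata=True).split() returns a list of sentences in which every token is a list of the form [word, POS, chunk, PNP, lemma], which is why these methods read the lemma at index 4 and the POS tag at index 1. A minimal standalone sketch (the sample text and stopword set are purely illustrative):

from pattern.es import parse as parse_es

stopwords = {"los", "y"}  # illustrative stopword set
lemmas, tags = [], []
for sentence in parse_es("los gatos negros duermen", lemmata=True).split():
    for token in sentence:
        # token == [word, POS, chunk, PNP, lemma]
        if token[0].lower() not in stopwords:
            lemmas.append(token[4])
            tags.append(token[1])
print(lemmas, tags)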
Example #2
    # Assumes the same parse_en / parse_es aliases as in Example #1.
    def _extract_ngrams(self, text, lang):
        """Return lemmas, lowercased surface words and POS tags of non-stopword tokens."""
        unigrams_lemmas = []
        unigrams_words = []
        pos_tagged = []
        if lang == 'es':
            sentences = list(parse_es(text, lemmata=True).split())
        else:
            sentences = list(parse_en(text, lemmata=True).split())

        for sentence in sentences:
            for token in sentence:
                if token[0].lower() not in self._stopwords:
                    unigrams_words.append(token[0].lower())  # surface form
                    unigrams_lemmas.append(token[4])         # lemma
                    pos_tagged.append(token[1])              # POS tag

        return unigrams_lemmas, unigrams_words, pos_tagged
Example #3
# Assumes: from pattern.en import parse as parse_en, tag as tag_en
#          from pattern.es import parse as parse_es, tag as tag_es
def pos_realign(source, target):
    """Pair each English word with a Spanish word carrying the same POS tag."""
    source = parse_en(source, relations=True, lemmata=True)
    target = parse_es(target, relations=True, lemmata=True)
    # tag() returns a list of (word, POS) tuples.
    pos_en = tag_en(source.string)
    pos_es = tag_es(target.string)
    pos_realigned = []
    for e in pos_en:
        try:
            # Consume one Spanish token per English token, then look for a
            # remaining Spanish word whose POS tag matches the English one.
            pos_es.pop(0)
            word_in_target = [word for (word, pos) in pos_es if pos == e[1]]
            if word_in_target:
                pos_realigned.append((e[0], word_in_target[0]))
        except IndexError:
            # No Spanish tokens left to consume.
            break
    source_out = [src for (src, tgt) in pos_realigned]
    target_out = [tgt for (src, tgt) in pos_realigned]
    return source_out, target_out
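
A minimal usage sketch for pos_realign(), assuming tag_en and tag_es are the tag() helpers from pattern.en and pattern.es (the sentence pair is illustrative and the printed output only indicative):

from pattern.en import parse as parse_en, tag as tag_en
from pattern.es import parse as parse_es, tag as tag_es

# With pos_realign() defined as above:
src, tgt = pos_realign("the black cats sleep", "los gatos negros duermen")
print(src)  # English words for which a Spanish word with the same POS tag was found
print(tgt)  # the matching Spanish words, in the same order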
Example #4
from pattern.en import parse as parse_en
print(parse_en("the black cats", chunks=False))  # the/DT black/JJ cats/NNS
print("")

# ... where DT = determiner, JJ = adjective, NN = noun.
# This is true for all languages that Pattern supports:

from pattern.de import parse as parse_de
from pattern.es import parse as parse_es
from pattern.fr import parse as parse_fr
from pattern.it import parse as parse_it
from pattern.nl import parse as parse_nl

print(parse_de("die schwarzen Katzen",
               chunks=False))  # die/DT schwarze/JJ Katzen/NNS
print(parse_es("los gatos negros", chunks=False))  # los/DT gatos/NNS negros/JJ
print(parse_fr("les chats noirs", chunks=False))  # les/DT chats/NNS noirs/JJ
print(parse_it("i gatti neri", chunks=False))  # i/DT gatti/NNS neri/JJ
print(parse_nl("de zwarte katten", chunks=False))  # de/DT zwarte/JJ katten/NNS
print("")

# In some cases, this means the original tagset is mapped to Penn Treebank:
# e.g., for German (STTS), Spanish (PAROLE), Dutch (WOTAN).

from pattern.de import STTS
from pattern.es import PAROLE
from pattern.nl import WOTAN

print(parse_de("die schwarzen Katzen", chunks=False, tagset=STTS))
print(parse_es("los gatos negros", chunks=False, tagset=PAROLE))
print(parse_nl("de zwarte katten", chunks=False, tagset=WOTAN))
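
The same Penn-style tags can also be obtained as plain (word, tag) tuples via each language module's tag() function, which is handier for further processing than the slash-delimited string; a minimal sketch (the tags shown are only indicative):

from pattern.es import tag as tag_es

for word, pos in tag_es("los gatos negros"):
    print(word, pos)  # e.g. los DT / gatos NNS / negros JJ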