Example #1
    def _extract_ngrams(self, text, lang):
        # parse_es / parse_en are assumed to be pattern.es.parse / pattern.en.parse.
        unigrams_lemmas = []
        pos_tagged = []
        if lang == 'es':
            sentences = parse_es(text, lemmata=True).split()
        else:
            sentences = parse_en(text, lemmata=True).split()

        # Each token is [word, pos, chunk, pnp, lemma]; skip stopwords, keep
        # the lemma (index 4) and the POS tag (index 1).
        for sentence in sentences:
            for token in sentence:
                if token[0].lower() not in self.stopwords:
                    unigrams_lemmas.append(token[4])
                    pos_tagged.append(token[1])

        return unigrams_lemmas, pos_tagged
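For reference, a minimal sketch (not part of the example above) of the token layout that Pattern's parse(..., lemmata=True).split() produces, which is what the token[0], token[1] and token[4] indices rely on; the sample sentence is illustrative only:

from pattern.en import parse as parse_en

# With the default chunks=True plus lemmata=True, each token is a list of the
# form [word, pos, chunk, pnp, lemma]: token[0] is the word, token[1] the
# Penn Treebank tag and token[4] the lemma.
for sentence in parse_en("The black cats were sleeping", lemmata=True).split():
    for token in sentence:
        print(token)
# e.g. ['cats', 'NNS', 'I-NP', 'O', 'cat'], ['were', 'VBD', 'B-VP', 'O', 'be'], ...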
Example #2
    def _extract_ngrams(self, text, lang):
        unigrams_lemmas = []
        unigrams_words = []
        pos_tagged = []
        if lang == 'es':
            sentences = list(parse_es(text, lemmata=True).split())
        else:
            sentences = list(parse_en(text, lemmata=True).split())

        for sentence in sentences:
            for token in sentence:
                if token[0].lower() not in self._stopwords:
                    unigrams_words.append(token[0].lower())
                    unigrams_lemmas.append(token[4])
                    pos_tagged.append(token[1])

        return unigrams_lemmas, unigrams_words, pos_tagged
Example #3
def pos_realign(source, target):
    """Pair each English word with a Spanish word carrying the same POS tag."""
    source = parse_en(source, relations=True, lemmata=True)
    target = parse_es(target, relations=True, lemmata=True)
    # pprint_en(source)
    # print()
    # pprint_es(target)
    pos_en = tag_en(source.string)  # list of (word, tag) pairs for the English text
    pos_es = tag_es(target.string)  # list of (word, tag) pairs for the Spanish text
    # print(pos_en)
    # print(pos_es)
    pos_realigned = []
    for idx, e in enumerate(pos_en):
        try:
            pos_es.pop(0)  # consume one Spanish token per English token
            word_in_target = [word for (word, pos) in pos_es if pos == e[1]]
            # print(e, word_in_target)
            if word_in_target:
                pos_realigned.append((e[0], word_in_target[0]))
        except IndexError:  # pos_es exhausted
            break
    source_out = [src for (src, tgt) in pos_realigned]
    target_out = [tgt for (src, tgt) in pos_realigned]
    return source_out, target_out
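A hedged usage sketch for pos_realign; the sentence pair is made up, and the aliases below spell out the imports the snippet assumes (pattern.en and pattern.es expose both parse and tag):

from pattern.en import parse as parse_en, tag as tag_en
from pattern.es import parse as parse_es, tag as tag_es

# Align English words with Spanish words that share the same Penn Treebank tag.
src, tgt = pos_realign("the black cat sleeps", "el gato negro duerme")
print(list(zip(src, tgt)))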
Example #4
# coding: utf-8
import os, sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

# By default, parse() uses part-of-speech tags from the Penn Treebank tagset:
# http://www.clips.ua.ac.be/pages/penn-treebank-tagset

# It is a good idea to study the tagset and its abbreviations for a few minutes.

from pattern.en import parse as parse_en
print(parse_en("the black cats", chunks=False))  # the/DT black/JJ cat/NNS
print("")

# ... where DT = determiner, JJ = adjective, NN = noun.
# This is true for all languages that Pattern supports:

from pattern.de import parse as parse_de
from pattern.es import parse as parse_es
from pattern.fr import parse as parse_fr
from pattern.it import parse as parse_it
from pattern.nl import parse as parse_nl

print(parse_de("die schwarzen Katzen",
               chunks=False))  # die/DT schwarzen/JJ Katzen/NNS
print(parse_es("los gatos negros", chunks=False))  # los/DT gatos/NNS negros/JJ
print(parse_fr("les chats noirs", chunks=False))  # les/DT chats/NNS noirs/JJ
print(parse_it("i gatti neri", chunks=False))  # i/DT gatti/NNS neri/JJ
print(parse_nl("de zwarte katten", chunks=False))  # de/DT zwarte/JJ katten/NNS
print("")

# In some cases, this means the original tagset is mapped to Penn Treebank:
Example #5
from __future__ import print_function
# coding: utf-8
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

# By default, parse() uses part-of-speech tags from the Penn Treebank tagset:
# http://www.clips.ua.ac.be/pages/penn-treebank-tagset

# It is a good idea to study the tagset and its abbreviations for a few minutes.

from pattern.en import parse as parse_en
print(parse_en("the black cats", chunks=False))      # the/DT black/JJ cat/NNS
print()


# ... where DT = determiner, JJ = adjective, NN = noun.
# This is true for all languages that Pattern supports:

from pattern.de import parse as parse_de
from pattern.es import parse as parse_es
from pattern.fr import parse as parse_fr
from pattern.it import parse as parse_it
from pattern.nl import parse as parse_nl

print(parse_de("die schwarzen Katzen", chunks=False)) # die/DT schwarze/JJ Katzen/NNS
print(parse_es("los gatos negros"    , chunks=False)) # los/DT gatos/NNS negros/JJ
print(parse_fr("les chats noirs"     , chunks=False)) # les/DT chats/NNS noirs/JJ
print(parse_it("i gatti neri"        , chunks=False)) # i/DT gatti/NNS neri/JJ
print(parse_nl("de zwarte katten"    , chunks=False)) # de/DT zwarte/JJ katten/NNS
print()

# In some cases, this means the original tagset is mapped to Penn Treebank:
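The comment above notes that some original tagsets are mapped to Penn Treebank. A minimal sketch of how the original tags could be requested instead, assuming the tagset parameter and the STTS, PAROLE and WOTAN constants exposed by pattern.de, pattern.es and pattern.nl; verify these against the installed Pattern version:

from pattern.de import parse as parse_de, STTS    # assumed: original German (STTS) tagset
from pattern.es import parse as parse_es, PAROLE  # assumed: original Spanish (PAROLE) tagset
from pattern.nl import parse as parse_nl, WOTAN   # assumed: original Dutch (WOTAN) tagset

print(parse_de("die schwarzen Katzen", chunks=False, tagset=STTS))
print(parse_es("los gatos negros", chunks=False, tagset=PAROLE))
print(parse_nl("de zwarte katten", chunks=False, tagset=WOTAN))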