Example #1
class NltkToolsStemmer(LemmatizerWrapper):
    """
    Wraps the NltkTools stemmer. It currently uses WordNetLemmatizer,
    which is English only.

    @warning This is the original implementation as used in our English
             Wikipedia parser. No effort has been made to clean up the
             code, or to fix the hardwired indexing, etc. The data must
             be already POS tagged, and the POS field must be the last one.
    """
    def __init__(self, params):
        self.nt = NltkTools(stem=True)

    def lemmatize(self, tokens):
        # HACK: stem each token twice, once as-is and once with a leading
        # capital lowercased ("hard" stem; only done when the rest of the word
        # is lowercase, so acronyms and single capitals are left alone), then
        # append both stems to the token in place.
        for sen_i, sen in enumerate(tokens):
            stemmed = self.nt.stem(((tok[0], tok[-1]) for tok in sen))
            hard_stemmed = self.nt.stem(
                (((tok[0][0].lower() + tok[0][1:] if tok[0][0].isupper()
                   and tok[0][1:].islower() else tok[0]), tok[-1])
                 for tok in sen))
            for tok_i, (tok_stemmed, tok_hard_stemmed) in enumerate(
                    zip(stemmed, hard_stemmed)):
                tokens[sen_i][tok_i].append(tok_stemmed[2])
                tokens[sen_i][tok_i].append(tok_hard_stemmed[2])
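The wrapper mutates the token lists in place: every token gains a stem and a "hard" stem (the stem computed after undoing a sentence-initial capital). A minimal usage sketch, assuming NltkToolsStemmer is importable from the surrounding project and that NltkTools.stem yields (token, POS, stem) triples, as the [2] indexing above implies:

# Hypothetical usage; NltkToolsStemmer and its module path come from the
# surrounding project, and the token layout ([surface form, ..., POS tag
# last]) follows the docstring's requirement.
stemmer = NltkToolsStemmer(params={})          # params is unused by __init__
sentences = [
    [[u"Cats", u"NNS"], [u"sleep", u"VBP"]],   # one POS-tagged sentence
]
stemmer.lemmatize(sentences)
# Each token now carries two extra fields, [form, POS, stem, hard_stem],
# e.g. [u"Cats", u"NNS", <stem of "Cats">, <stem of "cats">].
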
Example #2
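# Fragment of a larger Python 2 script: it reads a %%#PAGE-separated Wikipedia
# dump from stdin and re-emits it with a tokenized, POS-tagged and stemmed
# Title field per page. `import sys` is needed below, and `nt` is assumed to
# be an NltkTools instance (tokenizer, POS tagger and stemmer enabled) created
# earlier in the original file.
import sys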
pageSep = "%%#PAGE"
actPage = None
starter = False
for line in sys.stdin:
    l = line.strip().decode("utf-8")
    if l.startswith(pageSep):
        if actPage is not None:
            print
        
        actPage = l.split(" ", 1)[1]
        starter = True
        print l.encode("utf-8").replace(" ", "\t", 1)
        print "%%#Field\tTitle"
        titleTokens = nt.word_tokenize(actPage)
        titleTokensWithPos = list(nt.pos_tag(titleTokens))
        stemmedTitleTokens = nt.stem(titleTokensWithPos)
        hardStemmedTitleTokens = list(nt.stem(
            ((x[0][0].lower() + x[0][1:] if x[0][0].isupper()
              and x[0][1:].islower() else x[0]), x[1])
            for x in titleTokensWithPos))
        for i, (tok, pos, stem) in enumerate(stemmedTitleTokens):
            print u"{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(
                tok, "word", "0", pos, stem,
                hardStemmedTitleTokens[i][2]).encode("utf-8")
        print
    elif starter and l.startswith("Templates:"):
        try:
            templates = l.split("\t", 1)[1]
            print u"%%#Templates\t{0}".format(templates).encode("utf-8")
        except IndexError:
            pass
    elif starter and l.startswith("REDIRECT"):
        print "%%#Redirect"
    else:
        if starter:
            print "%%#Field\tBody"