Example 1
def demo():
    from nltk import tokenize, stem

    # Create a simple regular expression based stemmer
    stemmer = stem.RegexpStemmer('ing$|s$|e$', min=4)
    text = "John was eating icecream"
    tokens = text.split()

    # Print the results.
    print(stemmer)
    for word in tokens:
        print('%20s => %s' % (word, stemmer.stem(word)))
    print()
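For reference, a minimal sketch of what this demo prints, assuming NLTK's standard RegexpStemmer behaviour (the regular expression is stripped from every token of at least min=4 characters):

# Illustrative mapping for the sample sentence (not captured output):
#   John     => John        no suffix in 'ing$|s$|e$' matches
#   was      => was         shorter than min=4, returned unchanged
#   eating   => eat         'ing$' stripped
#   icecream => icecream    no suffix matches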
Example 2
    def BIO_tag(self, label, list_of_words, sent, tupla):
        
        '''Perform BIO tagging on the tuples (tupla) of the sentence.'''
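        # Assumed layout of each entry in `tupla`, inferred from the indexing
        # used below (the original snippet does not document it):
        #   t[0] token, t[1] start offset, t[2] end offset,
        #   t[3] POS tag, t[4] BIO label, t[5] already-tagged flag (0 or 1).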
        
        regex = re.compile('[^a-zA-Z]')
        st = stem.RegexpStemmer('ing$|s$|y$|ly$|ed$', min=4)
        
        lenw=0
        for g in list_of_words:
            if g in sent:
                lis = g.split()
                
                if len(lis)>1: # lis is made up of more than one word
                    #print(lis)
                    for idxx, word in enumerate(lis):
                        word = regex.sub('', word)
                        word = st.stem(word)
                        # look for the first word of the list in the sentence tuples,
                        # matching by word index and word length rather than by the word itself
                        for idx, t in enumerate(tupla):
                            if t[1]==sent.index(g)+lenw:
                                if t[5]==0:
                                    if idxx == 0:
                                        tupla[idx] = ((t[0], sent.index(g)+lenw, sent.index(g)+lenw + len(word), t[3],'B'+'-'+label, 1))
                                    else:
                                        tupla[idx] = ((t[0], sent.index(g)+lenw, sent.index(g)+lenw + len(word), t[3], 'I'+'-'+label, 1))
                        lenw+=len(word)+1
                        
                else: # handle entries made up of a single word
                    for idx, t in enumerate(tupla):
                        if regex.sub('', st.stem(t[0])) == g and t[1]==sent.index(g)+lenw:
                            if t[5]==0:
                                if (idx!=len(tupla)-1 and (tupla[idx+1][3] == 'JJ')):# or tupla[idx+1][3] == 'NN') )
                                    tupla[idx] = ((t[0], sent.index(g)+lenw, sent.index(g)+lenw + len(g), t[3], 'B'+'-'+label, 1))
                                    tupla[idx+1] = ((tupla[idx+1][0], tupla[idx+1][1], tupla[idx+1][2], tupla[idx+1][3], 'I'+'-'+ label, 1))

                                elif (idx!=0 and ("," not in tupla[idx-1][3]) and (tupla[idx-1][3] == 'JJ')):# or tupla[idx-1][3] == 'NN') )
                                    tupla[idx-1] = ((tupla[idx-1][0], tupla[idx-1][1], tupla[idx-1][2], tupla[idx-1][3], 'B'+'-'+ label, 1))
                                    tupla[idx] = ((t[0], sent.index(g)+lenw, sent.index(g)+lenw + len(g), t[3], 'I'+'-'+label, 1))

                                else:
                                    tupla[idx] = ((t[0], sent.index(g)+lenw, sent.index(g)+lenw + len(g), t[3], 'B'+'-'+label, 1))

        return tupla
Example 3
"""
For getting the tokens, another file is used that stores around 50 lakh (5 million) tokens with their POS tags, corresponding to the text of the reviews.

"""

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import glob
from nltk import pos_tag, stem
from math import log10

threshold = 5000

tokenizer = RegexpTokenizer(r'[\w\']+')
get_token = tokenizer.tokenize
snowball = stem.RegexpStemmer('ies$|s$')
swlist = stopwords.words('english')
noun_file_pointer = open("tokenized_noun_file.txt", "w")

noun_postags, tf = [], {}
curr_line = 0
tot = 5000000
percent = 0
print "Reading lines"
pos_tags_file = open("pos_tags_file.txt", "r")
line = pos_tags_file.readline()
while line:
    i = eval(line)
    if i[1].find("NN") != -1:
        noun_postags.append(i[0].strip(".,-?").lower())
    line = pos_tags_file.readline()
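The snippet does not show what pos_tags_file.txt contains; judging from the eval(line) call and the i[0]/i[1] accesses, each line presumably holds a (token, POS-tag) tuple literal. A minimal, hypothetical sketch of that assumption:

# Hypothetical line from pos_tags_file.txt and how the loop above consumes it
# (illustrative only; the real file is not part of the snippet).
sample_line = "('battery', 'NN')\n"
i = eval(sample_line)                     # -> ('battery', 'NN')
if i[1].find("NN") != -1:                 # keep noun-like tags only
    print(i[0].strip(".,-?").lower())     # -> battery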
Example 4
import os, sys
import string
import nltk
nltk.download('punkt')
nltk.download("stopwords")

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
import numpy as np

from nltk import stem
#ps = PorterStemmer()
rxstem = stem.RegexpStemmer('er$|a$|as$|az$')
snowball = stem.snowball.EnglishStemmer()

if __name__ == '__main__':
    current_path = sys.argv[1]
    #current_path= os.getcwd()
    fout = open(sys.argv[2], 'w')
    test = open('check_index.csv', 'r')

    lines = []
    for line in open(os.path.join(current_path, 'title_StackOverflow.txt')):
        line = line[:-1]
        line = line.translate(str.maketrans("", "", string.punctuation))
        line = nltk.word_tokenize(line)
        line = [
            word for word in line
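The example is cut off at this point. Based on the imports above (TfidfVectorizer, TruncatedSVD, normalize, KMeans), the rest of the pipeline presumably looks something like the following sketch; this is an assumption, not the original code, and the component counts are arbitrary:

# Typical TF-IDF + LSA + k-means clustering over the tokenized titles (a sketch, not the original code).
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(' '.join(words) for words in lines)  # titles -> TF-IDF matrix
svd = TruncatedSVD(n_components=20)
X_reduced = normalize(svd.fit_transform(X))                       # reduce dimensionality, then normalize
labels = KMeans(n_clusters=20).fit_predict(X_reduced)             # cluster the reduced vectors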
Example 5
def __init__(self):
    self.stemmer = stem.RegexpStemmer(stemmer_rules_tuple, min=6)
Example 6
def __init__(self, feature_list):
    self.feature_list = feature_list
    self.snowball = stem.RegexpStemmer('ies$|s$')
Example 7
def test_stemming(word):
    print('WordNetLemmatizer:', stem.WordNetLemmatizer().lemmatize(word))
    print('LancasterStemmer:', stem.LancasterStemmer().stem(word))
    print('PorterStemmer:', stem.PorterStemmer().stem(word))
    print('RegexpStemmer:', stem.RegexpStemmer('ing$|s$|e$', min=4).stem(word))
    print('SnowballStemmer:', stem.SnowballStemmer('english').stem(word))
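A quick way to exercise the comparison (the sample words are arbitrary; WordNetLemmatizer also needs the WordNet data, e.g. via nltk.download('wordnet')):

# Print how each stemmer/lemmatizer treats a few sample words; the algorithms
# apply different suffix rules, so the results need not agree.
for w in ('flies', 'eating', 'happily'):
    print('---', w)
    test_stemming(w)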