def demo():
    from nltk import stem

    # Create a simple regular-expression-based stemmer.
    stemmer = stem.RegexpStemmer('ing$|s$|e$', min=4)

    text = "John was eating icecream"
    tokens = text.split()

    # Print the stemmer and each stemmed token.
    print(stemmer)
    for word in tokens:
        print('%20s => %s' % (word, stemmer.stem(word)))
    print()
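For reference, RegexpStemmer does nothing more than delete every match of its pattern from words of at least `min` characters. A minimal sketch of the equivalent logic (the helper name regexp_stem and its defaults are illustrative, not part of NLTK):

import re

def regexp_stem(word, pattern='ing$|s$|e$', min_len=4):
    # Words shorter than min_len are returned unchanged, mirroring the
    # stemmer's `min` argument; everything else has the pattern deleted.
    if len(word) < min_len:
        return word
    return re.sub(pattern, '', word)

# 'was' is too short to be stemmed; 'eating' loses its '-ing' suffix.
assert regexp_stem('was') == 'was'
assert regexp_stem('eating') == 'eat'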
def BIO_tag(self, label, list_of_words, sent, tupla):
    '''Apply BIO tags to the token tuples of a sentence.'''
    # Requires `import re` and `from nltk import stem` at module level.
    regex = re.compile('[^a-zA-Z]')
    st = stem.RegexpStemmer('ing$|s$|y$|ly$|ed$', min=4)
    lenw = 0
    for g in list_of_words:
        if g in sent:
            lis = g.split()
            if len(lis) > 1:  # the entry is a multi-word expression
                for idxx, word in enumerate(lis):
                    word = regex.sub('', word)
                    word = st.stem(word)
                    # Look up each word of the expression in the sentence
                    # tuples, matching by character offset and word length
                    # rather than by the word itself.
                    for idx, t in enumerate(tupla):
                        if t[1] == sent.index(g) + lenw:
                            if t[5] == 0:
                                if idxx == 0:
                                    tupla[idx] = (t[0], sent.index(g) + lenw,
                                                  sent.index(g) + lenw + len(word),
                                                  t[3], 'B-' + label, 1)
                                else:
                                    tupla[idx] = (t[0], sent.index(g) + lenw,
                                                  sent.index(g) + lenw + len(word),
                                                  t[3], 'I-' + label, 1)
                    lenw += len(word) + 1
            else:
                # The entry is a single word.
                for idx, t in enumerate(tupla):
                    if regex.sub('', st.stem(t[0])) == g and t[1] == sent.index(g) + lenw:
                        if t[5] == 0:
                            # An adjective right after the match joins the entity.
                            if idx != len(tupla) - 1 and tupla[idx + 1][3] == 'JJ':  # or tupla[idx+1][3] == 'NN'
                                tupla[idx] = (t[0], sent.index(g) + lenw,
                                              sent.index(g) + lenw + len(g),
                                              t[3], 'B-' + label, 1)
                                tupla[idx + 1] = (tupla[idx + 1][0], tupla[idx + 1][1],
                                                  tupla[idx + 1][2], tupla[idx + 1][3],
                                                  'I-' + label, 1)
                            # An adjective right before the match (not preceded
                            # by a comma) starts the entity instead.
                            elif idx != 0 and "," not in tupla[idx - 1][3] and tupla[idx - 1][3] == 'JJ':  # or tupla[idx-1][3] == 'NN'
                                tupla[idx - 1] = (tupla[idx - 1][0], tupla[idx - 1][1],
                                                  tupla[idx - 1][2], tupla[idx - 1][3],
                                                  'B-' + label, 1)
                                tupla[idx] = (t[0], sent.index(g) + lenw,
                                              sent.index(g) + lenw + len(g),
                                              t[3], 'I-' + label, 1)
                            else:
                                tupla[idx] = (t[0], sent.index(g) + lenw,
                                              sent.index(g) + lenw + len(g),
                                              t[3], 'B-' + label, 1)
    return tupla
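To make the tuple layout concrete, here is a hypothetical call (the `tagger` instance and the FOOD label are made up; the assignments above imply each tuple is (token, start, end, POS, BIO label, done flag)):

sent = "great pizza here"
tupla = [('great', 0, 5, 'JJ', 'O', 0),
         ('pizza', 6, 11, 'NN', 'O', 0),
         ('here', 12, 16, 'RB', 'O', 0)]
tagged = tagger.BIO_tag('FOOD', ['pizza'], sent, tupla)
# 'pizza' matches, and the preceding adjective 'great' is pulled into the
# entity, yielding B-FOOD on 'great' and I-FOOD on 'pizza'.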
For getting the tokens, another file is used that stores around 50 lakh
(5 million) tokens with their tags, corresponding to the text of the reviews.
"""
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import glob
from nltk import pos_tag, stem
from math import log10

threshold = 5000
tokenizer = RegexpTokenizer(r'[\w\']+')
get_token = tokenizer.tokenize
snowball = stem.RegexpStemmer('ies$|s$')
swlist = stopwords.words('english')

noun_file_pointer = open("tokenized_noun_file.txt", "w")
noun_postags, tf = [], {}

curr_line = 0
tot = 5000000
percent = 0

print("Reading lines")
pos_tags_file = open("pos_tags_file.txt", "r")
line = pos_tags_file.readline()
while line:
    # Each line holds one printed (token, tag) tuple; keep only nouns.
    i = eval(line)
    if i[1].find("NN") != -1:
        noun_postags.append(i[0].strip(".,-?").lower())
    line = pos_tags_file.readline()
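Since eval() will execute anything found in the file, a safer equivalent of the loop above is ast.literal_eval, assuming each line of pos_tags_file.txt really is a plain (token, tag) tuple literal such as ('Pizza', 'NNP'):

from ast import literal_eval

with open("pos_tags_file.txt") as pos_tags_file:
    for line in pos_tags_file:
        token, tag = literal_eval(line)
        if "NN" in tag:
            noun_postags.append(token.strip(".,-?").lower())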
import os, sys
import string

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
import numpy as np
from nltk import stem

#ps = PorterStemmer()
rxstem = stem.RegexpStemmer('er$|a$|as$|az$')
snowball = stem.snowball.EnglishStemmer()

if __name__ == '__main__':
    current_path = sys.argv[1]
    #current_path = os.getcwd()
    fout = open(sys.argv[2], 'w')
    test = open('check_index.csv', 'r')
    lines = []
    for line in open(os.path.join(current_path, 'title_StackOverflow.txt')):
        line = line[:-1]
        # Strip punctuation (str.maketrans replaces the removed
        # Python 2 string.maketrans idiom used in the original).
        line = line.translate(str.maketrans('', '', string.punctuation))
        line = nltk.word_tokenize(line)
        line = [word for word in line
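The snippet breaks off mid-comprehension, but the imports suggest the usual TF-IDF, SVD, and k-means pipeline. A minimal sketch under that assumption (component and cluster counts are illustrative, not the author's):

docs = [' '.join(tokens) for tokens in lines]  # lines: cleaned token lists
tfidf = TfidfVectorizer(stop_words='english').fit_transform(docs)
reduced = normalize(TruncatedSVD(n_components=20).fit_transform(tfidf))
labels = KMeans(n_clusters=20).fit_predict(reduced)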
def __init__(self):
    # stemmer_rules_tuple is a regexp of stemming rules defined elsewhere
    # in the source.
    self.stemmer = stem.RegexpStemmer(stemmer_rules_tuple, min=6)
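RegexpStemmer accepts either a pattern string or a compiled regexp, so a plausible value, purely hypothetical since the real stemmer_rules_tuple is not shown, would look like:

stemmer_rules_tuple = 'ing$|ed$|ly$|es$|s$'  # hypothetical stand-in
stem.RegexpStemmer(stemmer_rules_tuple, min=6).stem('wonderfully')  # -> 'wonderful'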
def __init__(self, feature_list):
    self.feature_list = feature_list
    # Despite the name, this is a simple plural-stripping RegexpStemmer,
    # not NLTK's Snowball stemmer.
    self.snowball = stem.RegexpStemmer('ies$|s$')
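As a quick illustration of what the 'ies$|s$' pattern does (min defaults to 0, so even short words are stemmed):

plural_stripper = stem.RegexpStemmer('ies$|s$')  # illustrative instance
print(plural_stripper.stem('berries'))  # -> 'berr'
print(plural_stripper.stem('tokens'))   # -> 'token'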
def test_stemming(word):
    print('WordNetLemmatizer:', stem.WordNetLemmatizer().lemmatize(word))
    print('LancasterStemmer:', stem.LancasterStemmer().stem(word))
    print('PorterStemmer:', stem.PorterStemmer().stem(word))
    print('RegexpStemmer:', stem.RegexpStemmer('ing$|s$|e$', min=4).stem(word))
    print('SnowballStemmer:', stem.SnowballStemmer('english').stem(word))
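Example call (outputs as produced by a typical NLTK install; the lemmatizer additionally needs nltk.download('wordnet')):

test_stemming('running')
# WordNetLemmatizer: running   (treats the word as a noun by default)
# LancasterStemmer: run
# PorterStemmer: run
# RegexpStemmer: runn          (naively strips the 'ing$' suffix)
# SnowballStemmer: run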