Example 1
def demo():
    from nltk import tokenize, stem

    # Create a simple regular expression based stemmer
    stemmer = stem.RegexpStemmer('ing$|s$|e$', min=4)
    text = "John was eating icecream"
    tokens = text.split()

    # Print the results.
    print(stemmer)
    for word in tokens:
        print('%20s => %s' % (word, stemmer.stem(word)))
    print()
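For reference, a minimal sketch of what this demo prints, assuming NLTK's standard RegexpStemmer behaviour (the regular expression is stripped from every token of at least min=4 characters):

# Illustrative mapping for the sample sentence (not captured output):
#   John     => John        no suffix in 'ing$|s$|e$' matches
#   was      => was         shorter than min=4, returned unchanged
#   eating   => eat         'ing$' stripped
#   icecream => icecream    no suffix matches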
Example 2
    def BIO_tag(self, label, list_of_words, sent, tupla):
        
        '''Perform BIO tagging on the tuples (tupla) of the sentence.'''
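        # Assumed layout of each entry in `tupla`, inferred from the indexing
        # used below (the original snippet does not document it):
        #   t[0] token, t[1] start offset, t[2] end offset,
        #   t[3] POS tag, t[4] BIO label, t[5] already-tagged flag (0 or 1).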
        
        regex = re.compile('[^a-zA-Z]')
        st = stem.RegexpStemmer('ing$|s$|y$|ly$|ed$', min=4)
        
        lenw=0
        for g in list_of_words:
            if g in sent:
                lis = g.split()
                
                if len(lis)>1: # lis is made up of more than one word
                    #print(lis)
                    for idxx, word in enumerate(lis):
                        word = regex.sub('', word)
                        word = st.stem(word)
                        # look for the first word of the list in the sentence tuples,
                        # matching by word index and word length rather than by the word itself
                        for idx, t in enumerate(tupla):
                            if t[1]==sent.index(g)+lenw:
                                if t[5]==0:
                                    if idxx == 0:
                                        tupla[idx] = ((t[0], sent.index(g)+lenw, sent.index(g)+lenw + len(word), t[3],'B'+'-'+label, 1))
                                    else:
                                        tupla[idx] = ((t[0], sent.index(g)+lenw, sent.index(g)+lenw + len(word), t[3], 'I'+'-'+label, 1))
                        lenw+=len(word)+1
                        
                else: # handle entries made up of a single word
                    for idx, t in enumerate(tupla):
                        if regex.sub('', st.stem(t[0])) == g and t[1]==sent.index(g)+lenw:
                            if t[5]==0:
                                if (idx!=len(tupla)-1 and (tupla[idx+1][3] == 'JJ')):# or tupla[idx+1][3] == 'NN') )
                                    tupla[idx] = ((t[0], sent.index(g)+lenw, sent.index(g)+lenw + len(g), t[3], 'B'+'-'+label, 1))
                                    tupla[idx+1] = ((tupla[idx+1][0], tupla[idx+1][1], tupla[idx+1][2], tupla[idx+1][3], 'I'+'-'+ label, 1))

                                elif (idx!=0 and ("," not in tupla[idx-1][3]) and (tupla[idx-1][3] == 'JJ')):# or tupla[idx-1][3] == 'NN') )
                                    tupla[idx-1] = ((tupla[idx-1][0], tupla[idx-1][1], tupla[idx-1][2], tupla[idx-1][3], 'B'+'-'+ label, 1))
                                    tupla[idx] = ((t[0], sent.index(g)+lenw, sent.index(g)+lenw + len(g), t[3], 'I'+'-'+label, 1))

                                else:
                                    tupla[idx] = ((t[0], sent.index(g)+lenw, sent.index(g)+lenw + len(g), t[3], 'B'+'-'+label, 1))

        return tupla
Example 3
"""
For getting the tokens, another file is used that stores around 50 lakh (5 million) tokens with their POS tags, corresponding to the text of the reviews.

"""

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import glob
from nltk import pos_tag, stem
from math import log10

threshold = 5000

tokenizer = RegexpTokenizer(r'[\w\']+')
get_token = tokenizer.tokenize
snowball = stem.RegexpStemmer('ies$|s$')
swlist = stopwords.words('english')
noun_file_pointer = open("tokenized_noun_file.txt", "w")

noun_postags, tf = [], {}
curr_line = 0
tot = 5000000
percent = 0
print "Reading lines"
pos_tags_file = open("pos_tags_file.txt", "r")
line = pos_tags_file.readline()
while line:
    i = eval(line)
    if i[1].find("NN") != -1:
        noun_postags.append(i[0].strip(".,-?").lower())
    line = pos_tags_file.readline()
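The snippet does not show what pos_tags_file.txt contains; judging from the eval(line) call and the i[0]/i[1] accesses, each line presumably holds a (token, POS-tag) tuple literal. A minimal, hypothetical sketch of that assumption:

# Hypothetical line from pos_tags_file.txt and how the loop above consumes it
# (illustrative only; the real file is not part of the snippet).
sample_line = "('battery', 'NN')\n"
i = eval(sample_line)                     # -> ('battery', 'NN')
if i[1].find("NN") != -1:                 # keep noun-like tags only
    print(i[0].strip(".,-?").lower())     # -> battery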
Example 4
import os, sys
import string
import nltk
nltk.download('punkt')
nltk.download("stopwords")

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
import numpy as np

from nltk import stem
#ps = PorterStemmer()
rxstem = stem.RegexpStemmer('er$|a$|as$|az$')
snowball = stem.snowball.EnglishStemmer()

if __name__ == '__main__':
    current_path = sys.argv[1]
    #current_path= os.getcwd()
    fout = open(sys.argv[2], 'w')
    test = open('check_index.csv', 'r')

    lines = []
    for line in open(os.path.join(current_path, 'title_StackOverflow.txt')):
        line = line[:-1]
        line = line.translate(str.maketrans("", "", string.punctuation))
        line = nltk.word_tokenize(line)
        line = [
            word for word in line
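The example is cut off at this point. Based on the imports above (TfidfVectorizer, TruncatedSVD, normalize, KMeans), the rest of the pipeline presumably looks something like the following sketch; this is an assumption, not the original code, and the component counts are arbitrary:

# Typical TF-IDF + LSA + k-means clustering over the tokenized titles (a sketch, not the original code).
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(' '.join(words) for words in lines)  # titles -> TF-IDF matrix
svd = TruncatedSVD(n_components=20)
X_reduced = normalize(svd.fit_transform(X))                       # reduce dimensionality, then normalize
labels = KMeans(n_clusters=20).fit_predict(X_reduced)             # cluster the reduced vectors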
Example 5
def __init__(self):
    self.stemmer = stem.RegexpStemmer(stemmer_rules_tuple, min=6)
Example 6
def __init__(self, feature_list):
    self.feature_list = feature_list
    self.snowball = stem.RegexpStemmer('ies$|s$')
Example 7
def test_stemming(word):
    print('WordNetLemmatizer:', stem.WordNetLemmatizer().lemmatize(word))
    print('LancasterStemmer:', stem.LancasterStemmer().stem(word))
    print('PorterStemmer:', stem.PorterStemmer().stem(word))
    print('RegexpStemmer:', stem.RegexpStemmer('ing$|s$|e$', min=4).stem(word))
    print('SnowballStemmer:', stem.SnowballStemmer('english').stem(word))
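A quick way to exercise the comparison (the sample words are arbitrary; WordNetLemmatizer also needs the WordNet data, e.g. via nltk.download('wordnet')):

# Print how each stemmer/lemmatizer treats a few sample words; the algorithms
# apply different suffix rules, so the results need not agree.
for w in ('flies', 'eating', 'happily'):
    print('---', w)
    test_stemming(w)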