Python LancasterStemmer.stem Exemples, nltk.LancasterStemmer.stem Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : views.py Projet : srinivasraman18/Covid19SearchEngine

def process(word_list):
	"""Return the Lancaster stem of every word in ``word_list``, in order."""
	stemmer = LancasterStemmer()
	return [stemmer.stem(token) for token in word_list]

Exemple #2

0

Afficher le fichier

def Stem(s):
    """Stem each space-separated piece of ``s`` and return the lower-cased result.

    Anything that is not a non-empty ``str`` (``None``, other types, ``""``)
    yields the empty string.
    """
    if not isinstance(s, str) or not s:
        return ""
    stemmer = LancasterStemmer()
    stemmed = " ".join(stemmer.stem(piece) for piece in s.split(" "))
    return stemmed.lower()

Exemple #3

0

Afficher le fichier

 def stem_words(self, words):
     """Stem every space-separated word of the string ``words``.

     Returns a single string in which each stem is preceded by one space,
     so a non-empty result always starts with a space.
     """
     stemmer = LancasterStemmer()
     return "".join(" " + stemmer.stem(token) for token in words.split(" "))

Exemple #4

0

Afficher le fichier

def stem_words(words):
    """Return the Lancaster stems of an iterable of tokenized words, in order."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

Exemple #5

0

Afficher le fichier

Fichier : part1.py Projet : Saltzguy/School

def main():
    """Run the tokenizing / stemming / tagging / chunking demo on the saved page.

    Downloads the page via ``save_data_from_webpage()``, reads it back with
    ``get_data_from_file()`` and prints, in order: the tokens, the Porter,
    Lancaster and Snowball stems, a set difference of the stems, the POS tags,
    the trigrams and the named-entity chunks.
    """
    save_data_from_webpage()

    text = get_data_from_file()

    # creates a list of the tokenized words; every later step reuses it
    tt = word_tokenize(text)
    pprint(tt)

    # creates a new list of stems for each of the stemmers
    psteam = PorterStemmer()
    psteam_list = [psteam.stem(word) for word in tt]
    pprint(psteam_list)

    lsteam = LancasterStemmer()
    lsteam_list = [lsteam.stem(word) for word in tt]
    pprint(lsteam_list)

    # SnowballStemmer needs the language name; calling it without one raises
    # TypeError. 'english' is assumed to match the page content - TODO confirm.
    ssteam = SnowballStemmer('english')
    ssteam_list = [ssteam.stem(word) for word in tt]
    pprint(ssteam_list)

    p = set(psteam_list)
    l = set(lsteam_list)
    s = set(ssteam_list)
    # displays the Snowball stems that are not "Lancaster-only" stems
    pprint(s.difference(l.difference(p)))

    # pos tagging: pos_tag expects a list of tokens; passing the raw string
    # would tag every single character instead of every word
    pos_list = pos_tag(tt)
    pprint(pos_list)

    # creates a new list for the lemmatized words (its output is currently
    # not printed)
    lemmatizer = WordNetLemmatizer()
    lem = [lemmatizer.lemmatize(word) for word in tt]

    # returns a generator of trigrams using the tokenized list tt
    trig = trigrams(tt)
    # displays the results
    print(list(trig))

    # ne_chunk finds non overlapping groups
    # pos_tag ids how the text is used in speech
    NamedEntity = ne_chunk(pos_tag(wordpunct_tokenize(text)))
    print(NamedEntity)

Exemple #6

0

Afficher le fichier

Fichier : text_mining.py Projet : rauswarn/PyEEGLab

 def _normalize(self, item):
     """Tokenize, filter and stem the text carried by ``item``.

     ``item`` is a ``(key, value)`` pair where ``value[0]`` is the raw text
     and ``value[1]`` is passed through untouched. Returns
     ``(key, (stems, value[1]))`` where ``stems`` holds the Lancaster stem
     (with trailing ``'s'`` characters stripped) of every lower-cased,
     alphanumeric, non-stopword token.
     """
     key, value = item
     ls = LancasterStemmer()
     # Build the stop-word set once instead of re-evaluating
     # stopwords.words('english') and scanning a list for every token.
     english_stopwords = set(stopwords.words('english'))
     tokens = [word.lower() for word in word_tokenize(value[0])]
     text = [
         ls.stem(word).rstrip('s')
         for word in tokens
         if word not in english_stopwords and word.isalnum()
     ]
     return (key, (text, value[1]))

Exemple #7

0

Afficher le fichier

def __stem_document(document_name: pathlib.Path) -> list:
    """Read a UTF-8 text file and return its lines with every word stemmed.

    Each line is stripped, split on single spaces, stemmed word by word and
    re-joined with single spaces; the result has one string per input line.
    """
    stemmer = LancasterStemmer()
    with document_name.open('r', encoding='utf-8') as document:
        lines = document.readlines()
    return [
        ' '.join(stemmer.stem(token) for token in raw_line.strip().split(' '))
        for raw_line in lines
    ]

Exemple #8

0

Afficher le fichier

Fichier : final.py Projet : lfbox7/senior-project

def get_stems(tokens):
    """Stem tagged words, leaving determiners, pronouns and most nouns as-is.

    ``tokens`` is an iterable of sequences of ``(word, tag)`` pairs. Words
    tagged DT, PRP, PRP$, NN, NNP or NNPS are kept unchanged; every other
    word is Lancaster-stemmed. The flat result is handed to ``get_lemma``.
    """
    # Tags whose words are passed through without stemming.
    unstemmed_tags = ('DT', 'PRP', 'PRP$', 'NN', 'NNP', 'NNPS')
    stemmer = LancasterStemmer()
    stemmed_tokens = [
        word[0] if word[1] in unstemmed_tags else stemmer.stem(word[0])
        for token in tokens
        for word in token
    ]
    return get_lemma(stemmed_tokens)

Exemple #9

0

Afficher le fichier

def getStemsFromURL(page_url):
    '''
    Given the link of a webpage (string), returns a list of
    all the words' stems in the webpage text.

    Tokens are lower-cased; English stop words and tokens that are not
    purely alphabetic are dropped before stemming.
    '''
    with urlopen(page_url) as infile:
        soup = BeautifulSoup(infile, features="lxml")

    ls = LancasterStemmer()
    # Build the stop-word set once instead of re-evaluating
    # stopwords.words("english") and scanning a list for every token.
    english_stopwords = set(stopwords.words("english"))
    words = [w.lower() for w in word_tokenize(soup.text)]
    return [ls.stem(w) for w in words if w not in english_stopwords and w.isalpha()]

Exemple #10

0

Afficher le fichier

Fichier : spelling.py Projet : nelsonauner/ep.lsa

def checkstemmers():
	"""Compare Porter and Lancaster stems for every word of the parsed capture file.

	Returns a list with one ``[word, porter_stem, lancaster_stem]`` entry per
	space-separated word of the text returned by ``customparse``.
	"""
	raw = customparse("C://cygwin//home//nelson auner//Pontikes//FinalData.OctNewKeepAndAnonymous/capsavem/my_cape/outtoget.cap.txt")
	wordz = raw.split(" ")
	# NOTE(review): the dictionary is never consulted below; it is kept only
	# because constructing it may matter to the caller's environment - confirm
	# and remove if not.
	dic = enchant.Dict("en_US")
	from nltk import LancasterStemmer, PorterStemmer
	lancaster = LancasterStemmer()
	porter = PorterStemmer()
	resporter = [porter.stem(t).replace(" ","") for t in wordz]
	reslan = [lancaster.stem(t).replace(" ","") for t in wordz]
	# NOTE(review): an earlier version also built (but never returned) the
	# subset of rows where the stemmers disagree with the word; the full
	# comparison is what callers have always received.
	return [[word, p_stem, l_stem] for word, p_stem, l_stem in zip(wordz, resporter, reslan)]

Exemple #11

0

Afficher le fichier

    def clean_tweets(self, text):
        """Return the cleaned, stemmed, stop-word-free tokens of a tweet.

        URLs are removed, every line is reduced to lower-case alphanumeric
        words, each word is Lancaster-stemmed, digits are stripped, and words
        listed in ``newspaper3k/SmartStoplist.txt`` are dropped.

        Unlike the previous implementation, every line of ``text`` contributes
        tokens (the old loop rebound its accumulator and kept only the last
        line) and words are stemmed one by one instead of stemming a whole
        line as if it were a single word.
        """
        st = LancasterStemmer()
        #st = PorterStemmer()
        with open('newspaper3k/SmartStoplist.txt', 'r') as f:
            # A set makes the per-token stop-word test O(1).
            stopwords = {line.strip() for line in f}

        # remove URL's
        text = re.sub(r'http\S+', '', text)

        result = []
        for tweet_line in text.split("\n"):
            cleaned = re.sub(r"[^a-zA-Z0-9]+", ' ', tweet_line).lower()
            for token in cleaned.split():
                # Same order as before: stem first, then strip digits.
                stemmed = ''.join(ch for ch in st.stem(token) if not ch.isdigit())
                # Tokens made only of digits vanish entirely.
                if stemmed and stemmed not in stopwords:
                    result.append(stemmed)
        return result

Exemple #12

0

Afficher le fichier

def checkstemmers():
    """Stem every word of the parsed capture file with Porter and Lancaster.

    Returns a list of ``[word, porter_stem, lancaster_stem]`` rows, one per
    space-separated word. The sample words, the enchant dictionary and the
    filtered "stemmers disagree" rows are built but not part of the result.
    """
    source_path = "C://cygwin//home//nelson auner//Pontikes//FinalData.OctNewKeepAndAnonymous/capsavem/my_cape/outtoget.cap.txt"
    raw = customparse(source_path)
    wordz = raw.split(" ")
    O = ["sweating", "tripping", "gunning", "going"]
    HH = [sample[:-1] for sample in O]
    dic = enchant.Dict("en_US")
    from nltk import LancasterStemmer, PorterStemmer
    lancaster = LancasterStemmer()
    porter = PorterStemmer()
    resporter = [porter.stem(word).replace(" ", "") for word in wordz]
    reslan = [lancaster.stem(word).replace(" ", "") for word in wordz]
    resall = [list(row) for row in zip(wordz, resporter, reslan)]
    # Rows where at least one stemmer changed the word.
    filtres = [row for row in resall if not (row[0] == row[2] == row[1])]
    return resall

Exemple #13

0

Afficher le fichier

def getMostUsedWordsTxt(file, wordnum):
    '''
    Given a text file name (string) and the number of most
    used words we want to find (int), returns a list of the wordnum
    most common stems and their counts from the most common
    to the least:
    [('1st_most_common_word', count1),
    ('2nd_most_common_word', count2),
    ...,
    ('wordnumth_most_common_word', countwordnum)]

    Words are lower-cased; English stop words and words that are not
    purely alphabetic are dropped before stemming.
    '''
    with open(file, "r") as f:
        words = f.read().split()

    ls = LancasterStemmer()
    # Build the stop-word set once instead of re-evaluating
    # stopwords.words("english") and scanning a list for every word.
    english_stopwords = set(stopwords.words("english"))
    lowered = (w.lower() for w in words)
    stems = [ls.stem(w) for w in lowered if w not in english_stopwords and w.isalpha()]
    return Counter(stems).most_common(wordnum)

Exemple #14

0

Afficher le fichier

Fichier : Classifier.py Projet : recklessPaul94/test

    def tokenize(self, description):
        """Lower-case, split, filter, lemmatize and stem ``description``.

        Returns the list of Lancaster stems of the WordNet lemmas of every
        whitespace-separated term that is not an English stop word.

        For a null/NaN ``description`` the method returns a pair of (the same)
        empty list instead.
        """
        # dont process NaN or Null values
        if pd.isnull(description):
            # NOTE(review): this path returns a 2-tuple while the normal path
            # returns a single list; kept as-is because callers may rely on
            # it - confirm and unify.
            filtered = []
            return filtered, filtered

        # Build the stop-word set once instead of re-evaluating
        # stopwords.words('english') and scanning a list for every term.
        english_stopwords = set(stopwords.words('english'))
        terms = [word for word in description.lower().split() if word not in english_stopwords]

        # Lemmatize with WordNet first, then stem the lemma with Lancaster.
        lemmatizer = WordNetLemmatizer()
        stemmer = LancasterStemmer()
        return [stemmer.stem(lemmatizer.lemmatize(word)) for word in terms]

Exemple #15

0

Afficher le fichier

Fichier : generate_detection_old.py Projet : danghermang/antispam_filter

def get_words_from_string(string):
    """Return the Lancaster stem of every alphabetic run in ``string``.

    The input is lower-cased first; words are maximal runs of ASCII letters,
    so digits and punctuation act as separators. (Extraction of links,
    e-mail addresses and IP addresses is not performed.)
    """
    lowered = string.lower()
    word_pattern = r'[A-Za-z]+'
    stemmer = LancasterStemmer()
    return [stemmer.stem(match) for match in re.findall(word_pattern, lowered)]

Exemple #16

0

Afficher le fichier

# Tweet loading and cleaning
# Each usable line of corpus.txt splits on '|' into exactly two fields,
# (category, tweet); any other line is counted in `wrong` and echoed.
wrong = 0
with open('corpus.txt', 'r') as f:
    tweets = []
    for line in f.readlines():
        # Strip newlines, BOM characters and tabs before splitting the fields.
        cols = line.replace('\n', '').replace('\ufeff',
                                              '').replace('\t', '').split('|')
        if len(cols) == 2:
            (cat, tweet) = (cols[0], cols[1])
            # Removal of URLs, hashtags and mentions
            tweet_regex = regex_spaces.sub(
                ' ', regex_ht_mn.sub('', regex_url.sub('', tweet))).lower()
            # Removal of caps and accents
            # NOTE(review): tweet_raw is not read anywhere in this fragment;
            # the tokens below are built from tweet_regex, so accents are
            # not removed from them - confirm which one is intended.
            tweet_raw = unidecode.unidecode(tweet_regex).lower()
            # Stem each token, collapse repeated characters, and skip
            # stop words and tokens matching regex_nonword.
            tokens = [
                remove_repeated_chars(stemmer.stem(t))
                for t in tweet_tokenizer.tokenize(tweet_regex)
                if not t in stopwords and not regex_nonword.match(t)
            ]
            tweets.append((tokens, cols[0]))
        else:
            wrong += 1
            print(line, end='')

print('Wrong: {0}'.format(wrong))
word_features = get_word_features(get_words_in_tweets(tweets))
# Tweets grouped by their category label: 'P', 'N' or 'NEU'.
tweets_cat = {
    'P': [t for t in tweets if t[1] == 'P'],
    'N': [t for t in tweets if t[1] == 'N'],
    'NEU': [t for t in tweets if t[1] == 'NEU']
}

Exemple #17

0

Afficher le fichier

from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer

# One instance of each stemmer; only SnowballStemmer is given a language.
pStemmer = PorterStemmer()
lStemmer = LancasterStemmer()
sStemmer = SnowballStemmer('english')

# Print the stem each algorithm produces for a capitalised "-ing" word.
print(pStemmer.stem("Playing"))
print(lStemmer.stem("Dancing"))
print(sStemmer.stem("Killing"))

from nltk.stem import WordNetLemmatizer

# Lemmatize the same words plus an irregular plural.
# NOTE(review): no part-of-speech argument is passed and the words are
# capitalised; presumably the "-ing" forms come back unchanged - verify.
lemmetizer = WordNetLemmatizer()
print(lemmetizer.lemmatize("Playing"))
print(lemmetizer.lemmatize("Dancing"))
print(lemmetizer.lemmatize("Killing"))
print(lemmetizer.lemmatize("geese"))

from nltk import wordpunct_tokenize, pos_tag, ne_chunk

# Show the three stages of the pipeline: tokens, POS-tagged tokens, and
# the chunks built from the tagged tokens.
sentence = "Mark and John are working at google"
print(wordpunct_tokenize(sentence), '\n')
print(pos_tag(wordpunct_tokenize(sentence)), '\n')
print(ne_chunk(pos_tag(wordpunct_tokenize(sentence))))

Exemple #18

0

Afficher le fichier

doc = docx.Document(arquivo)

# 02 - List of paragraphs
texto_full = [paragrafo.text for paragrafo in doc.paragraphs]

# Selection of the 2nd and 3rd paragraphs
# NOTE(review): the slice takes list indices 2 and 3 - confirm these are the
# intended paragraphs.
p_2e3 = texto_full[2:4]

# Tokenize the selected paragraphs as one space-joined string
tokens = word_tokenize(' '.join(p_2e3))

## RSLP
rslp = RSLPStemmer()
stemms_rslp = [rslp.stem(token) for token in tokens]

## Porter
poter = PorterStemmer()
stemms_poter = [poter.stem(token) for token in tokens]

## Lancaster
lancaster = LancasterStemmer()
stemms_lanc = [lancaster.stem(token) for token in tokens]

Exemple #19

0

Afficher le fichier

提取文本数据的词干

三种词干提取算法，Lancaster词干提取器比其他两个词干提取器更严格
严格程度而言：Porter最轻松，Lancaster最严格。
Lancaster速度快但是会减少单词的很大部分，通常会选择Snowball词干提取器
'''
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer

# Sample words to run through every stemmer.
words = [
    'table', 'probably', 'wolves', 'playing', 'is', 'dog', 'the', 'beaches',
    'grounded', 'dreamt', 'envision'
]

# Column headings, in the same order the stemmers are applied below.
stemmers = ['PORTER', 'LANCASTER', 'SNOWBALL']

stemmer_porter = PorterStemmer()
stemmer_lancaster = LancasterStemmer()
stemmer_snowball = SnowballStemmer('english')

# One right-aligned, 16-character column for the word plus one per stemmer.
formatted_row = '{:>16}' * (len(stemmers) + 1)
print(formatted_row.format('WORD', *stemmers))

for word in words:
    stemmed_words = [
        stemmer.stem(word)
        for stemmer in (stemmer_porter, stemmer_lancaster, stemmer_snowball)
    ]
    print(formatted_row.format(word, *stemmed_words))

Exemple #20

0

Afficher le fichier

Fichier : stemming14.py Projet : venkatram64/python_nltk

from nltk import PorterStemmer, LancasterStemmer, word_tokenize

# Sample text; adjacent string literals are concatenated into one string.
line = "My name is Venkatram Veerareddy, technical architect.\n I am having 20 years of experience in "\
                          " Software industry working \nfrom applications to products by using \n" \
                          " C, C++, Java, Javascript and databases "\
                          " like Oracle, MS SQL Server, Postgres, MySQL and OrientDB."

tokens = word_tokenize(line)

# Porter stems of every token.
porter = PorterStemmer()
pStems = list(map(porter.stem, tokens))
print(pStems)

print("************************************************")

# Lancaster stems of the same tokens, for comparison.
lancaster = LancasterStemmer()
lStems = list(map(lancaster.stem, tokens))
print(lStems)

Exemple #21

0

Afficher le fichier

    'page': TITLE,
    'format': "json"
}

R = S.get(url=URL, params=PARAMS)
DATA = R.json()

# get the text
wiki_page_text = DATA["parse"]["text"]["*"]

# convert the page HTML to plain text, dropping link targets
h = html2text.HTML2Text()
h.ignore_links = True
page_text = h.handle(wiki_page_text)

# create a new stemmer
ls = LancasterStemmer()

# tokenize text and lower-case every token
words = [w.lower() for w in nltk.word_tokenize(page_text)]

# Build the stop-word set once instead of re-evaluating
# stopwords.words("english") and scanning a list for every token.
english_stopwords = set(stopwords.words("english"))

# eliminate stop words and stem the rest of the words
words = [ls.stem(w) for w in words if w not in english_stopwords and w.isalnum()]

freqs = Counter(words)

print("The 10 most frequently used stems in the ''Data science'' Wikipedia page are:")
for word, count in freqs.most_common(10):
    print(word, count)

Exemple #22

0

Afficher le fichier

Fichier : utils.py Projet : TadMC/SSAE_Catalog

def MakeFeaturesFromText(DIR, FNAME, SENT_CLASS, max_features):
    '''
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    EXPERIMENTAL:     Still trying to figure out if it produces good
    data sets. Use at your own risk.
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    Reads a text file and builds a binary bag-of-stems matrix (one row per
    kept line, one column per stem) plus an integer label per row derived
    from the class identifier words in SENT_CLASS. Label 0 means "no class
    word found"; class i of SENT_CLASS is labelled i + 1.

    NOTE: this function is written in Python 2 syntax (print statements,
    dict.has_key, list-returning range()/dict.values()).

    ARG: DIR          TYPE: string              DESC: Directory where the text file is saved; must end with a path separator because it is concatenated with FNAME
    ARG: FNAME        TYPE: string              DESC: File name
    ARG: SENT_CLASS   TYPE: List (of lists)     DESC: A list of lists with each internal list holding the id words of one class
    ARG: max_features TYPE: number              DESC: Upper bound compared against the number of rows of the matrix

    RETURNS: (MAT, labels) where MAT is the int32 matrix and labels is the
    flattened label array.
    '''

    print 'STARTING DATA GENERATION'

    from nltk import download as nldl
    from nltk import LancasterStemmer
    import nltk.corpus as corp
    import string

    # used for referencing later
    NUM_CLASSES = len(SENT_CLASS)
    TOTAL_CLASSES = NUM_CLASSES + 1

    # read file and begin cleaning
    # NOTE(review): TextFile is never closed in this function.
    TextFile = open(DIR + FNAME, 'r')
    RAW = TextFile.readlines()
    # removes newline, makes it all lower case, and splits it into separable words by spaces
    # NOTE(review): sent[:-1] drops the last character of every line, including
    # a final line that has no trailing newline.
    CLEAN = [sent[:-1].lower().split(' ') for sent in RAW]

    print 'LOADING STOPWORDS'

    # need the try/except because it requires the stop words from the nltk database
    # the except branch is just for first time running
    try:
        STOP_WORDS = corp.stopwords.words('english')
    except LookupError:
        # NOTE(review): this downloads the resource named 'words' while the
        # code reads corp.stopwords - presumably 'stopwords' was intended; verify.
        nldl('words')
        STOP_WORDS = corp.stopwords.words('english')
    finally:
        # NOTE(review): the whole feature-building pipeline lives inside this
        # finally clause, so it also runs when the except branch itself raises
        # (in which case STOP_WORDS is unbound).
        # stop words are common words like "the", "an" "a" etc..
        # punctuation characters and digits are appended as extra stop words
        STOP_WORDS += ' '.join(p for p in string.punctuation).split(' ') + ' '.join(d for d in string.digits).split(' ') + ' '.join(w for w in string.whitespace).split(' ')
        # removes stopwords and trivially short features
        # (a line is kept only if it had more than 3 tokens before filtering)
        CLEANER = [[word for word in sent if word not in STOP_WORDS] for sent in CLEAN if len(sent) > 3]
        KEYS = []

        print 'SORTING KEYS'
        # starts compiling the key list from the id words in sent_class
        for IDS in SENT_CLASS:
            KEYS += IDS

        print 'HEAVY DUTY STUFF'
        # finds all unique words. keeps combining and filtering duplicates
        for sent in CLEANER:
            TEMP = KEYS + sent
            KEYS = np.unique(TEMP).tolist()

        print 'COUNTING NCOs. . ',
        # KEY_COUNT maps word -> occurrence count. MAX_LIST collects every word
        # whose count reached the running maximum at the moment it was counted,
        # most recent first.
        MAX_LIST = []
        MAX = -1
        KEY_COUNT = {}
        for sent in CLEANER:
            for word in sent:
                if not KEY_COUNT.has_key(word):
                    KEY_COUNT[word] = 0

                KEY_COUNT[word] += 1
                if KEY_COUNT[word] >= MAX:
                    MAX = KEY_COUNT[word]
                    if word in  MAX_LIST:
                        MAX_LIST.remove(word)
                    MAX_LIST.insert(0, word)
        print '.',
        # words seen at most once
        TOO_FEW = [key for key in list(KEY_COUNT) if KEY_COUNT[key] <= 1]

        TOTAL_KEYS = len(list(KEY_COUNT))

        # NOTE(review): REMOVE_TOP is computed but not read again in this function.
        REMOVE_TOP = np.floor(TOTAL_KEYS / 10.)
        print '.',
        # class id words are never treated as too frequent or too rare
        for SCLASS in SENT_CLASS:
            for s in SCLASS:
                if s in MAX_LIST:
                    MAX_LIST.remove(s)
                if s in TOO_FEW:
                    TOO_FEW.remove(s)
        print '.',
        for tf in TOO_FEW:
            if tf in KEYS:
                KEYS.remove(tf)
        print '.',
        # TOO_MANY is the same list object as MAX_LIST
        TOO_MANY = MAX_LIST

        for tm in TOO_MANY:
            if tm in KEYS:
                KEYS.remove(tm)

        print 'TERMINATED THE UNDESIREABLES'
        print 'REMOVED ', len(TOO_MANY) + len(TOO_FEW), 'OF ', TOTAL_KEYS, 'UNDESIREABLES'


        print 'STARTING STEMMING'
        # removes suffixes and prefixes from words leaving the rootword only
        STEMMER = LancasterStemmer()

        # hash of stemmed words because it's tremendously faster to do it this
        # way versus calling LancasterStemmer every time
        STEM_DICT = { K : STEMMER.stem(K) for K in KEYS }

        print 'STEMMED DICTIONARY CREATED'
        # stem the class labels... and we know theyre in the hash because that
        # was the first thing we added to the key list
        STEM_LABS = [[STEM_DICT[ID] for ID in CLASS] for CLASS in SENT_CLASS]

        # now the dictionary is generated and the vectorization has begun
        # DICT.index(...) is used below, so this relies on values() returning
        # a list; several words may share one stem, so DICT can hold duplicates
        # and index() yields the first position.
        DICT = STEM_DICT.values()

        # using the hash table of stemmed words to look up the stemmed root
        # (list(STEM_DICT) is rebuilt for every word tested)
        STEM_DATA = [[STEM_DICT[word] for word in sent if word in list(STEM_DICT)] for sent in CLEANER]

        # indexes of root words in the dictionary so that you only have to do a few lookups
        # when generating the data set of binary vectors
        INT_DATA = [np.array([DICT.index(word) for word in sent],
                                       dtype=np.int32, order='c') for sent in STEM_DATA]
        # same as above
        INT_LABS = [[DICT.index(ID) for ID in CLASS] for CLASS in STEM_LABS]

        print 'CREATING DATA SET'
        print 'I BET THIS TAKES THE LONGSEST'
        # meat and potatoes
        # LABS: one label per kept line; MAT starts as a single all-zero row
        LABS = np.zeros((len(CLEANER), 1), dtype=np.int32, order='c')
        MAT_SHAPE = (1, len(DICT))
        MAT = np.zeros(MAT_SHAPE, dtype=np.int32, order='c')


        # priming for the for loop below so that we can stack each new feature at
        # the bottom of our data set.
        M_IND = np.array(INT_DATA[0], dtype=np.int32, order='c')
        MAT[0, M_IND] = 1

        print 'STILL Go',
        # label the first row: try classes in random order, first match wins
        # NOTE(review): unlike the loop for the remaining rows, a class is
        # removed from CLIST here only when it did NOT label the row, so the
        # same class can be drawn again.
        CLIST = range(0, NUM_CLASSES)
        for d in range(NUM_CLASSES):
            print 'i',
            i = CLIST[np.random.randint(0, len(CLIST))]

            IND = np.atleast_2d(INT_LABS[i])

            if np.any(MAT[0, IND] == 1) and LABS[0] == 0:
                LABS[0] = i + 1
            else:
                CLIST.remove(i)


        # makes whole data set
        for i in range(1, np.size(INT_DATA, 0)):

            # progress marker roughly every 20% of the rows
            if np.mod(i, np.floor(np.size(INT_DATA, 0) * .2)) == 0 :
                print 'n',
            NEXT_ARRAY = np.zeros(MAT_SHAPE, dtype=np.int32, order='c')
            TO_ONES = np.array(INT_DATA[i], dtype=np.int32, order='c')
            NEXT_ARRAY[0, TO_ONES] = 1

            # making labels
            # every class is drawn exactly once, in random order; the first
            # class whose id stems occur in the row sets the label
            CLIST = range(0, NUM_CLASSES)
            for d in range(NUM_CLASSES):

                j = CLIST[np.random.randint(0, len(CLIST))]
                IND = np.atleast_2d(INT_LABS[j])
                if np.any(NEXT_ARRAY[0, IND] == 1) and LABS[i] == 0:
                    LABS[i] = j + 1

                CLIST.remove(j)

            # the matrix is re-stacked (copied) once per row
            MAT = np.vstack((MAT, NEXT_ARRAY))

        print 'g!'

    print 'I WAS RIGHT'

    # to reduce non-classed features the below algo tries to reduce the number of
    # non-classed features but if max-features is fewer than MAT with all non-classed features
    # removed then steps have to be taken to remove classed features
    TOTAL = np.size(MAT, 0)
    # NOTE(review): after this clamp max_features <= TOTAL, so the branch
    # below only runs when max_features is strictly smaller than TOTAL.
    if max_features > TOTAL:
        max_features = TOTAL

    print 'THINNING THE HERD A BIT MORE'
    # NOTE(review): REMOVALS starts as [0, 1, ..., NUM_CLASSES], i.e. entry c
    # equals c; only entry 0 is ever overwritten (in the else branch below).
    REMOVALS = range(0, TOTAL_CLASSES)
    if TOTAL > max_features:
        NO_CLASS_CAND = np.argwhere(LABS == 0)
        HAS_CLASS_CAND = np.argwhere(LABS != 0)

        # gets weighted number of features of each class to remove
        # NOTE(review): the values computed in this branch (SHARED_REMOVE,
        # NO_CLASS_REMOVE) are not copied into REMOVALS.
        if (TOTAL - np.size(NO_CLASS_CAND)) >= max_features:
            HAS_CLASS_REMOVE = TOTAL - np.size(NO_CLASS_CAND) - max_features
            CLASS_FEATURES = np.array([np.sum(HAS_CLASS_CAND == CLASS) for CLASS in range(1, TOTAL_CLASSES)]) * 1.
            SHARED_REMOVE = np.floor(HAS_CLASS_REMOVE * (CLASS_FEATURES / np.sum(CLASS_FEATURES)))
            SHARED_REMOVE = SHARED_REMOVE.tolist()
            NO_CLASS_REMOVE = TOTAL - np.sum(SHARED_REMOVE) - max_features

        # no features removed
        else:
            NO_CLASS_REMOVE = TOTAL - np.size(NO_CLASS_CAND)
            REMOVALS[0] = NO_CLASS_REMOVE

            # a list multiplied by 0 is the empty list
            SHARED_REMOVE = range(0, NUM_CLASSES) * 0

        # removes the number of features determined above
        for c in range(0, TOTAL_CLASSES):
            if REMOVALS[c] != 0:
                for i in range(0, REMOVALS[c]):
                    CANDIDATES = np.argwhere(LABS == c)
                    # NOTE(review): DRAW is a position within CANDIDATES but is
                    # used directly as a row index of LABS/MAT - verify.
                    DRAW = np.random.randint(0, np.size(CANDIDATES))
                    # NOTE(review): the results of np.delete are discarded here;
                    # presumably LABS and MAT are left unchanged - confirm
                    # against the numpy.delete documentation.
                    np.delete(LABS, DRAW, 0)
                    np.delete(MAT, DRAW, 0)

    print 'DONE!... where\'d you go???'


    return MAT, np.ravel(LABS, order='c')

Exemple #23

0

Afficher le fichier

Fichier : aula2.py Projet : rodrigorenie/datascience

# English sample text.
texto_e = 'My name is Maximus Decimus Meridius, commander of the armies of ' \
          'the north, General of the Felix legions and loyal servant to the ' \
          'true emperor, Marcus Aurelius. Father to a murdered son, husband ' \
          'to a murdered wife. And I will have my vengeance, in this life or ' \
          'the next (Gladiator, the movie).'

# Portuguese version of the same text.
texto_p = 'Meu nome é Maximus Decimus Meridius, comandante dos exércitos do ' \
          'norte, general das legiões de Félix e servo leal ao verdadeiro ' \
          'imperador, Marcus Aurelius. Pai de um filho assassinado, marido ' \
          'de uma esposa assassinada. E eu terei minha vingança, nesta vida ' \
          'ou na próxima (Gladiador, o filme).'

# Only the last assignment is used by the stemmers below; the two tokenized
# texts are overwritten.
tokens = word_tokenize(texto_e)
tokens = word_tokenize(texto_p)
tokens = ['amor', 'amora', 'amoroso']

porter = PorterStemmer()
stems_porter = list(map(porter.stem, tokens))

lancaster = LancasterStemmer()
stems_lancaster = list(map(lancaster.stem, tokens))

rslp = RSLPStemmer()
stems_rslp = list(map(rslp.stem, tokens))

# Three 12-character columns followed by a free-width last column.
row_format = '{:12s} {:12s} {:12s} {}'
print(row_format.format('Tokens', 'Porter', 'Lancaster', 'RSLP'))
for t, p, l, r in zip(tokens, stems_porter, stems_lancaster, stems_rslp):
    print(row_format.format(t, p, l, r))

Exemple #24

0

Afficher le fichier

Fichier : GetTitleWordCounts.py Projet : 16373576/Markup-Language-Classification

                                    articleData = json.load(file)

                                    # save content of the json file
                                    tokenTitle = word_tokenize(articleData['title'])

                                    # add word from the tokenized data to create a list of all words for that article
                                    for word in tokenTitle:
                                        # convert all words to lower case to avoid duplicates
                                        word = word.lower()
                                        #  remove the symbol stopwords
                                        for char in word:
                                            if char in symbolStopwords:
                                                word = word.replace(char, "")
                                        # check if the word contains a number or is a stopword
                                        if not any(char.isdigit() for char in word):
                                            if word not in stopwords:
                                                # stem words to avoid duplication by pluralization
                                                word = stem.stem(word)
                                                # if word isn't already in the dict add it
                                                if word not in wordCount:
                                                    wordCount[word] = 1
                                                else:  # else increase the value of that key in the dict
                                                    wordCount[word] += 1

                                except ValueError:
                                    print("JsonDecodeError for file " + articleTitle)
                        with open("C:/Users/caire/Desktop/OutputData/ClassifyArticlesContentandTitle/OutputTitleArticles/" + s + ".txt", 'a', encoding='utf-8') as newFile:
                            newFile.write(str(dict(Counter(wordCount).most_common(10))) + "\n")
                        wordCount.clear()
    print(s + "'s title words counted for each article and added to file")

Exemple #25

0

Afficher le fichier

Fichier : classify.py Projet : jihokwak/mlstudy

# Stop-word removal
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
all_tokens = []
for sentence in word_tokens:
    # Lower-case every word, then keep the ones that are not stop words.
    lowered = (word.lower() for word in sentence)
    all_tokens.append([word for word in lowered if word not in stopwords])

# Stemming & lemmatizing
from nltk import LancasterStemmer
stemmer = LancasterStemmer()
stemmer.stem('happiest')

from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()
lemma.lemmatize('happiest', 'a')

# BOW (based on occurrence counts, so context is not interpreted)
import numpy as np
from scipy import sparse
# Three non-zero values and their (row, column) coordinates.
data = np.array([3, 1, 2])
row_pos = np.array([0, 0, 1])
col_pos = np.array([0, 2, 1])
sparse_coo = sparse.coo_matrix((data, (row_pos, col_pos)))
sparse_coo.toarray()

# Example 1

Exemple #26

0

Afficher le fichier

Fichier : stemming.py Projet : apurvnnd/NLP-using-nltk-basics

from nltk import PorterStemmer, LancasterStemmer
import nltk

# Load the words of the Brown corpus 'romance' category.
tokens = nltk.corpus.brown.words(categories=['romance'])

porter = PorterStemmer()
# NOTE(review): this rebinding discards the Brown words loaded above, so only
# the single word 'lying' is stemmed below.
tokens = ['lying']

# Print the Porter stem, then the Lancaster stem, of the same word.
print(porter.stem(tokens[0]))

lancast = LancasterStemmer()
print(lancast.stem(tokens[0]))

Exemple #27

0

Afficher le fichier

book_fileid = 'romance/marm05.txt'

# Load all paragraphs of the book, already tokenized
book_paras = machado.paras(book_fileid)

# Position 17 is the first paragraph of the first chapter, therefore:
book_tokens = book_paras[17][0] + book_paras[18][0]

# Portuguese stop words plus every punctuation character
book_stopwords = stopwords.words('portuguese')
book_stopwords += list(string.punctuation)
book_tokens = [
    lowered for lowered in (t.lower() for t in book_tokens)
    if lowered not in book_stopwords
]

#
# Run the stemmers
#
porter = PorterStemmer()
stems_porter = list(map(porter.stem, book_tokens))

lancaster = LancasterStemmer()
stems_lancaster = list(map(lancaster.stem, book_tokens))

rslp = RSLPStemmer()
stems_rslp = list(map(rslp.stem, book_tokens))

# Three 18-character columns followed by a free-width last column.
row_format = '{:18s} {:18s} {:18s} {}'
print(row_format.format('Tokens', 'Porter', 'Lancaster', 'RSLP'))
for t, p, l, r in zip(book_tokens, stems_porter, stems_lancaster, stems_rslp):
    print(row_format.format(t, p, l, r))

Exemple #28

0

Afficher le fichier

# stemming
from nltk import PorterStemmer

pst = PorterStemmer()
# Results of these two calls are discarded (interactive-style probes).
pst.stem("having")
pst.stem("sudeep")

# Inflections of one verb, run through each stemmer and the lemmatizer.
words_stem = ["give", "giving", "given", "gave"]
for words in words_stem:
    print("{} :{}".format(words, pst.stem(words)))

from nltk import LancasterStemmer

lnst = LancasterStemmer()
for words in words_stem:
    print("{} :{}".format(words, lnst.stem(words)))

from nltk import SnowballStemmer

snl = SnowballStemmer("english")
for words in words_stem:
    print("{} :{}".format(words, snl.stem(words)))

# lemmatizing

from nltk import WordNetLemmatizer
wordnet = WordNetLemmatizer()

for words in words_stem:
    print("{} :{}".format(words, wordnet.lemmatize(words)))

Exemple #29

0

Afficher le fichier

# In[30]:

# Copy from glove weights of words that appear in index2word
count = 0
for i in range(1, VOCAB_SIZE):
    w = index2word[i]
    g = glove_index_dict.get(w)
    # While the word is still missing from glove, retry with its lemma, then
    # the Porter stem of that, then the Lancaster stem of that.
    for normalise in (wnl.lemmatize, porter.stem, lancaster.stem):
        if g is not None:
            break
        w = normalise(w)
        g = glove_index_dict.get(w)
    if g is not None:
        embedding[i, :] = glove_embedding_weights[g, :]
        count += 1
print(
    '{num_tokens}-{per:.2f}% tokens in vocab found in glove and copied to embedding.'
    .format(num_tokens=count, per=count / float(VOCAB_SIZE) * 100))

# # Build Dateset

# In[32]:

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

Exemple #30

0

Afficher le fichier

        'The interviewee attributed their correct guess to how famous the first phrase is.'
    )
    print(
        'The first four content words were readily recognizable to anyone who has read the book.'
    )
    print('No function words were needed to identify the source.')
    print('\n')
    print('\n')

    print("_" * 70)
    print('QUESTION 3: Stemming and Lemmatization: \n')
    porter = PorterStemmer()
    lancaster = LancasterStemmer()
    snowball = SnowballStemmer('english')
    porter_stemming = [porter.stem(w) for w in filtered_words]
    lancaster_stemming = [lancaster.stem(w) for w in filtered_words]
    snowball_stemming = [snowball.stem(w) for w in filtered_words]

    #with wrapping
    format = '%s'
    pieces = [format % (word) for word in porter_stemming]
    output = ', '.join(pieces)
    wrapped_porter = fill(output)

    print('The Normalized, Filtered Text Stemmed with PorterStemmer is: \n')
    print(wrapped_porter)

    #with wrapping
    format = '%s'
    pieces = [format % (word) for word in lancaster_stemming]
    output = ', '.join(pieces)

Exemple #31

0

Afficher le fichier

Fichier : filter_features.py Projet : sophiavanvalkenburg/Masters-Project

#!/usr/bin/python

"""
This script takes tf-idf results and filters just those that are included in the review's feature list
"""

import sys
from nltk import LancasterStemmer

tfidf_fname = sys.argv[1]
features_fname = sys.argv[2]

stemmer = LancasterStemmer()

# Collect the stems of every word in the third comma-separated column of the
# features file. A set is used because it is only ever tested for membership.
stemmed_features = set()
with open(features_fname) as features_file:
    for line in features_file:
        cols = line.split(',')
        feature = cols[2]
        stemmed_features.update(stemmer.stem(w) for w in feature.split())

# Echo every tf-idf line whose third column is one of the collected stems.
# The files are opened with `with` so they are closed even on error.
with open(tfidf_fname) as tfidf_file:
    for line in tfidf_file:
        cols = line.split(',')
        word = cols[2]
        if word.strip() in stemmed_features:
            # Parenthesised single-argument print behaves the same under
            # Python 2 and Python 3.
            print(line.strip())

Exemple #32

0

Afficher le fichier

Fichier : stemmers.py Projet : daidenghui1234/Natural-Language-Processing-with-Python-Cookbook

#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time       : 2020/7/11 17:50
# @Author     : 代登辉
# @Email      : [email protected]
# @File       : stemmers.py
# @Software   : PyCharm
# @Description: 词干提取

from nltk import PorterStemmer, LancasterStemmer, word_tokenize

raw = "My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and " \
      "loyal servant to the true emperor, Marcus Aurelius. Father to a murdered son, husband to a murdered wife. And " \
      "I will have my vengeance, in this life or the next. "
# Split the text into word tokens
tokens = word_tokenize(raw)

# Porter: the less aggressive stemmer, removes comparatively few suffixes
# (such as s, es, e, ed, al)
porter = PorterStemmer()
pStems = list(map(porter.stem, tokens))
print(pStems)

# Lancaster: more thorough; removes letter case as well as suffixes
lancaster = LancasterStemmer()
lStems = list(map(lancaster.stem, tokens))
print(lStems)

Exemple #33

0

Afficher le fichier

Fichier : StemmingAndLemmatization.py Projet : pranabbijoypuri/PythonNLP

import nltk
from nltk.corpus import stopwords
from nltk import LancasterStemmer
from nltk import PorterStemmer

textcontent = "this is an input"
# Write your code here
pattern = r'[A-Za-z0-9]+'
tokenizedwords = nltk.regexp_tokenize(textcontent, pattern)

# De-duplicate the tokens first, then lower-case what is left.
tokenizedwords = list(map(str.lower, set(tokenizedwords)))

# Drop English stop words.
stop_words = set(stopwords.words('english'))
filteredwords = [token for token in tokenizedwords if token not in stop_words]

# Run the remaining words through both stemmers and the lemmatizer.
porter = nltk.PorterStemmer()
porterstemmedwords = list(map(porter.stem, filteredwords))

lancaster = LancasterStemmer()
lancasterstemmedwords = list(map(lancaster.stem, filteredwords))

wnl = nltk.WordNetLemmatizer()
lemmatizedwords = list(map(wnl.lemmatize, filteredwords))

print(porterstemmedwords, lancasterstemmedwords, lemmatizedwords)