Example #1
def process(word_list):
	lancaster=LancasterStemmer()
	new_list=[]
	for word in word_list:
		w=lancaster.stem(word)
		new_list.append(w)
	return new_list
Example #2
def Stem(s):
    if s is not None and isinstance(s, str) and len(s) > 0:
        stemmer = LancasterStemmer()
        s = (" ").join([stemmer.stem(z) for z in s.split(" ")])
        s = s.lower()
        return s
    else:
        return ""
Example #3
 def stem_words(self, words):
     """Stem words in list of tokenized words"""
     stemmer = LancasterStemmer()
     stems = ""
     for word in words.split(" "):
         stem = stemmer.stem(word)
         stems = stems + " " + stem
     return stems
Example #4
def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems
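A usage sketch for stem_words above, assuming nltk and its 'punkt' tokenizer data are available; the sentence is arbitrary:

from nltk import word_tokenize
from nltk.stem import LancasterStemmer  # required by stem_words()

tokens = word_tokenize("The wolves were running towards the beaches")
print(stem_words(tokens))  # one Lancaster stem per input token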
Example #5
def main():
    save_data_from_webpage()
    
    text = get_data_from_file()
  
    
    # creates a list of the tokenized words
    tt = word_tokenize(text)
    pprint(tt)

    # creates new lists of stemmed words using each of the stemmers
    psteam = PorterStemmer()
    psteam_list = []
    for word in tt:
        psteam_list.append(psteam.stem(word))
    pprint(psteam_list)

    lsteam = LancasterStemmer()
    lsteam_list = []
    for word in tt:
       lsteam_list.append(lsteam.stem(word))
    pprint(lsteam_list)

    ssteam = SnowballStemmer('english')  # SnowballStemmer requires a language argument
    ssteam_list = []
    for word in tt:
        ssteam_list.append(ssteam.stem(word))
    pprint(ssteam_list)

    p = set(psteam_list)
    l = set(lsteam_list)
    s = set(ssteam_list)
    # displays the differing stems
    pprint(s.difference(l.difference(p)))

    # POS tagging
    pos_list = pos_tag(tt)  # pos_tag expects a list of tokens, not a raw string
    pprint(pos_list)

    # creates a new list of lemmatized words
    lemmatizer = WordNetLemmatizer()
    lem = []
    for word in tt:
        lem.append(lemmatizer.lemmatize(word)) 
    #pprint(lem)
    
    # returns a generator of trigrams using the tokenized list tt
    trig = trigrams(tt)
    # displays the results
    print(list(trig))
    
    # ne_chunk finds non-overlapping named-entity groups
    # pos_tag identifies how each token is used in speech
    NamedEntity = ne_chunk(pos_tag(wordpunct_tokenize(text)))
    print(NamedEntity)
Example #6
 def _normalize(self, item):
     key, value = item
     ls = LancasterStemmer()
     text = word_tokenize(value[0])
     text = [word.lower() for word in text]
     text = [
         ls.stem(word).rstrip('s')
         for word in text
         if word not in stopwords.words('english') and word.isalnum()
     ]
     return (key, (text, value[1]))
Example #7
def __stem_document(document_name: pathlib.Path) -> list:
    stemmer = LancasterStemmer()
    with document_name.open('r', encoding='utf-8') as document:
        lines = document.readlines()
    result = []
    for line in lines:
        line = line.strip()
        words = [stemmer.stem(word) for word in line.split(' ')]
        sentence = ' '.join(words)
        result.append(sentence)
    return result
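A usage sketch for __stem_document; the file path is hypothetical and is assumed to point at a UTF-8 text file:

import pathlib
from nltk.stem import LancasterStemmer  # required by __stem_document()

stemmed_lines = __stem_document(pathlib.Path("corpus/sample.txt"))  # hypothetical path
for sentence in stemmed_lines[:3]:
    print(sentence)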
Example #8
def get_stems(tokens):
    stemmer = LancasterStemmer()
    stemmed_tokens = []
    for token in tokens:
        for word in token:
            if word[1] in ('DT', 'PRP', 'PRP$', 'NN', 'NNP', 'NNPS'):
                temp_tokens = word[0]
            else:
                temp_tokens = stemmer.stem(word[0])
            stemmed_tokens.append(temp_tokens)
    return get_lemma(stemmed_tokens)
Example #9
def getStemsFromURL(page_url):
    '''
    Given the link of a webpage (string), returns a list of 
    all the words' stems in the webpage text
    '''
    with urlopen(page_url) as infile:
        soup = BeautifulSoup(infile, features="lxml")

    ls = LancasterStemmer()
    words = word_tokenize(soup.text)
    words = [w.lower() for w in words]
    words = [ls.stem(w) for w in words if w not in stopwords.words("english") and w.isalpha()]
    return words
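A usage sketch; the URL is only an example, and the function assumes urlopen, BeautifulSoup (with lxml) and the nltk 'punkt' and 'stopwords' data are available, as in the original module:

stems = getStemsFromURL("https://en.wikipedia.org/wiki/Stemming")  # example URL
print(stems[:20])  # first twenty stems found in the page text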
Example #10
def checkstemmers():
	raw = customparse("C://cygwin//home//nelson auner//Pontikes//FinalData.OctNewKeepAndAnonymous/capsavem/my_cape/outtoget.cap.txt")
	wordz = raw.split(" ")
	O = ["sweating","tripping","gunning","going"] 
	HH = [i[0:-1] for i in O] 
	dic = enchant.Dict("en_US") 
	from nltk import LancasterStemmer, PorterStemmer
	lancaster = LancasterStemmer()
	porter = PorterStemmer()
	resporter = [porter.stem(t).replace(" ","") for t in wordz] 
	reslan = [lancaster.stem(t).replace(" ","") for t in wordz]
	resall = [[wordz[i],resporter[i],reslan[i]]  for i in range(len(wordz)) ]
	filtres = [resall[i] for i in range(len(resall)) if not (resall[i][0]==resall[i][2]==resall[i][1])]
	return resall
Example #11
def _create_stemmer(stemmer_type):
    """ Initialize a stemmer """
    return {
        'Porter': PorterStemmer(),
        'Snowball': SnowballStemmer('english'),
        'Lancaster': LancasterStemmer(),
    }[stemmer_type]
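A brief usage sketch of the dispatch table above:

stemmer = _create_stemmer('Lancaster')  # 'Porter' and 'Snowball' are also accepted
print(stemmer.stem('running'))
# Any other key raises KeyError, since the dict lookup has no fallback.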
Example #12
def words_stemmer(words,
                  type="PorterStemmer",
                  lang="english",
                  encoding="utf8"):
    supported_stemmers = [
        "PorterStemmer", "LancasterStemmer", "SnowballStemmer"
    ]
    if type is False or type not in supported_stemmers:
        return words
    else:
        stem_words = []
        if type == "PorterStemmer":
            stemmer = PorterStemmer()
            for word in words:
                stem_words.append(stemmer.stem(word))

        if type == "LancasterStemmer":
            stemmer = LancasterStemmer()
            for word in words:
                stem_words.append(stemmer.stem(word))

        if type == "SnowballStemmer":
            stemmer = SnowballStemmer(lang)
            for word in words:
                stem_words.append(stemmer.stem(word))
        # The stems are already str values; encoding them to bytes (as the original
        # .encode(encoding) calls did) would break the str.join below under Python 3.
        return " ".join(stem_words)
Example #13
    def clean_tweets(self, text):
        st = LancasterStemmer()
        #st = PorterStemmer()
        with open('newspaper3k/SmartStoplist.txt', 'r') as f:
            stopwords = [line.strip() for line in f]

        # remove URL's
        text = re.sub(r'http\S+', '', text)
        words = []
        for k in text.split("\n"):
            # keep alphanumerics only and lower-case each line
            line = re.sub(r"[^a-zA-Z0-9]+", ' ', k).lower()
            # drop digits, then stem each remaining word
            line = ''.join(ch for ch in line if not ch.isdigit())
            words.extend(st.stem(word) for word in line.split())
        result = [word for word in words if word not in stopwords]
        return result
Example #14
def checkstemmers():
    raw = customparse(
        "C://cygwin//home//nelson auner//Pontikes//FinalData.OctNewKeepAndAnonymous/capsavem/my_cape/outtoget.cap.txt"
    )
    wordz = raw.split(" ")
    O = ["sweating", "tripping", "gunning", "going"]
    HH = [i[0:-1] for i in O]
    dic = enchant.Dict("en_US")
    from nltk import LancasterStemmer, PorterStemmer
    lancaster = LancasterStemmer()
    porter = PorterStemmer()
    resporter = [porter.stem(t).replace(" ", "") for t in wordz]
    reslan = [lancaster.stem(t).replace(" ", "") for t in wordz]
    resall = [[wordz[i], resporter[i], reslan[i]] for i in range(len(wordz))]
    filtres = [
        resall[i] for i in range(len(resall))
        if not (resall[i][0] == resall[i][2] == resall[i][1])
    ]
    return resall
Example #15
def getMostUsedWordsTxt(file, wordnum):
    '''
    Given a text file name (string) and the number of most
    used words we want to find (int), returns a list of the wordnum
    most common elements and their counts from the most common
    to the least:
    [('1st_most_common_word', count1), 
    ('2nd_most_common_word', count2), 
    ...,
    ('wordnumth_most_common_word', countwordnum)]
    '''
    with open(file, "r") as f:
        words = f.read()
        words = words.split()

    ls = LancasterStemmer()
    words = [w.lower() for w in words]
    words = [ls.stem(w) for w in words if w not in stopwords.words("english") and w.isalpha()]
    freqs = Counter(words)
    return freqs.most_common(wordnum)
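A usage sketch with a hypothetical file name; it assumes LancasterStemmer, stopwords and Counter are imported as in the other examples and the nltk 'stopwords' corpus is available:

for stem, count in getMostUsedWordsTxt("article.txt", 10):  # hypothetical file
    print(stem, count)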
Example #16
    def tokenize(self, description):

        filtered = []
        # don't process NaN or Null values
        if pd.isnull(description):
            return filtered, filtered
        else:
            terms = description.lower().split()
            # terms = word_tokenize(description.lower().decode('utf-8'))
            filtered_stopwords = [word for word in terms if word not in stopwords.words('english')]

            # # Stemming Snowball
            # stemmer = SnowballStemmer('english')
            # for stem in filtered_stopwords:
            #     filtered.append(stemmer.stem(stem.decode('utf-8')))

            # # Stemming Porter
            # stemmer = PorterStemmer()
            # for stem in filtered_stopwords:
            #     filtered.append(stemmer.stem(stem.decode('utf-8')))

            # Lemmatizer Word Net Lemmatizer
            lemmatizer = WordNetLemmatizer()
            for lemmatized in filtered_stopwords:
                filtered.append(lemmatizer.lemmatize(lemmatized))

            filtered_final = []
            # Stemming Lancaster
            stemmer = LancasterStemmer()
            for stem in filtered:
                # filtered_final.append(stemmer.stem(stem.decode('utf-8')))
                filtered_final.append(stemmer.stem(stem))

            # # Lemmatizer TextBlob
            # for lemmatized in filtered_stopwords:
            #     w = Word(lemmatized.decode('utf-8'))
            #     filtered.append(w.lemmatize)

            return filtered_final
Example #17
def get_words_from_string(string):
    string = string.lower()
    word_pattern = r'[A-Za-z]+'
    # link_pattern = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})"
    # email_pattern = r"\S+@\S+"
    # ip_pattern = r"\b(?:\d{1,3}\.){3}\d{1,3}\b"
    result = []
    # for x in re.findall(link_pattern, string):
    #     try:
    #         url = "{0.scheme}://{0.netloc}/".format(urlsplit(x))
    #     except:
    #         url = x
    #     result.append(url)
    # string = re.sub(link_pattern, "", string)
    # result.extend(re.findall(email_pattern, string))
    # string = re.sub(email_pattern, "", string)
    # result.extend(re.findall(ip_pattern, string))
    # string = re.sub(ip_pattern, "", string)
    # stemmer = PorterStemmer()
    stemmer = LancasterStemmer()
    result.extend(
        [stemmer.stem(word) for word in re.findall(word_pattern, string)])
    # result.extend(re.findall(word_pattern, string))
    return result
Example #18
from nltk import WordNetLemmatizer, PorterStemmer, LancasterStemmer

# In[28]:

# Generate random embedding with same scale as glove
np.random.seed(SEED)
shape = (VOCAB_SIZE, EMBEDDING_SIZE)
scale = glove_embedding_weights.std() * np.sqrt(12) / 2
embedding = np.random.uniform(low=-scale, high=scale, size=shape)

# In[29]:

wnl = WordNetLemmatizer()
porter = PorterStemmer()
lancaster = LancasterStemmer()

# In[30]:

# Copy from glove weights of words that appear in index2word
count = 0
for i in range(1, VOCAB_SIZE):
    w = index2word[i]
    g = glove_index_dict.get(w)
    if g is None:
        w = wnl.lemmatize(w)
        g = glove_index_dict.get(w)
    if g is None:
        w = porter.stem(w)
        g = glove_index_dict.get(w)
    if g is None:
Example #19
import unidecode
from nltk import LancasterStemmer
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity, stopwords
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.tokenize import TweetTokenizer

# NLTK stuff
tweet_tokenizer = TweetTokenizer()
stopwords = sorted(stopwords.words('spanish') + ['rt'])
stemmer = LancasterStemmer()

# Regex stuff
regex_url = re.compile(
    r'((http[s]?|ftp):\/)?\/?([^:\/\s]+)((\/\w+)*\/)([\w\-\.]+[^#?\s]+)(.*)?(#[\w\-]+)?'
)
regex_ht_mn = re.compile(r'(#|@)[\w]*')
regex_spaces = re.compile(r'[ ]+')
regex_nonword = re.compile(r'[\W]+')
regex_repeated_ch = re.compile(r'(\w*)(\w)\2(\w*)')
regex_ch = r'\1\2\3'


# Feature stuff
def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(words)
    return all_words
Example #20
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Python for AHDA.

Part 5, Example 8.
"""

# Stem words

from nltk import LancasterStemmer
from nltk import PorterStemmer

print('LancasterStemmer')
print(LancasterStemmer().stem('lying'))
print(LancasterStemmer().stem('lie'))
print()
print('PorterStemmer')
print(PorterStemmer().stem('lying'))
print(PorterStemmer().stem('lie'))
Example #21
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer

pStemmer = PorterStemmer()
lStemmer = LancasterStemmer()
sStemmer = SnowballStemmer('english')

print(pStemmer.stem("Playing"))
print(lStemmer.stem("Dancing"))
print(sStemmer.stem("Killing"))

from nltk.stem import WordNetLemmatizer

lemmetizer = WordNetLemmatizer()
print(lemmetizer.lemmatize("Playing"))
print(lemmetizer.lemmatize("Dancing"))
print(lemmetizer.lemmatize("Killing"))
print(lemmetizer.lemmatize("geese"))

from nltk import wordpunct_tokenize, pos_tag, ne_chunk

sentence = "Mark and John are working at google"
print(wordpunct_tokenize(sentence), '\n')
print(pos_tag(wordpunct_tokenize(sentence)), '\n')
print(ne_chunk(pos_tag(wordpunct_tokenize(sentence))))
Example #22
arquivo = 'C:\\Users\\Usuario\\Dropbox\\Pos\\Pós DataScience\\4 - Análise de textos com R e Python\\Dados\\Noticia_2.docx'
doc = docx.Document(arquivo)

# 02 - List of paragraphs
texto_full = []
for paragrafo in doc.paragraphs:
    texto_full.append(paragrafo.text)

# Selection of the 2nd and 3rd paragraphs
p_2e3 = texto_full[2:4]

# Tokenize the list of paragraphs
tokens = word_tokenize(' '.join(p_2e3))

## RSLP
rslp = RSLPStemmer()
stemms_rslp = []
for i in tokens:
    stemms_rslp.append(rslp.stem(i))

## Porter
poter = PorterStemmer()
stemms_poter = []
for i in tokens:
    stemms_poter.append(poter.stem(i))

## Lancaster
lancaster = LancasterStemmer()
stemms_lanc = []
for i in tokens:
    stemms_lanc.append(lancaster.stem(i))
Example #23
def stem(array, word):
    stemmed = LancasterStemmer().stem(word)
    array.remove(word)
    array.append(stemmed)
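A short usage sketch showing the in-place behaviour of stem above: the original word is removed and its stem is appended, so list order is not preserved:

from nltk.stem import LancasterStemmer  # required by stem()

tokens = ["the", "wolves", "were", "running"]
stem(tokens, "running")
print(tokens)  # "running" is gone; its Lancaster stem is now the last element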
Example #24
from nltk import PorterStemmer, LancasterStemmer, word_tokenize

line = "My name is Venkatram Veerareddy, technical architect.\n I am having 20 years of experience in "\
                          " Software industry working \nfrom applications to products by using \n" \
                          " C, C++, Java, Javascript and databases "\
                          " like Oracle, MS SQL Server, Postgres, MySQL and OrientDB."

tokens = word_tokenize(line)
porter = PorterStemmer()
pStems = [porter.stem(t) for t in tokens]
print(pStems)

print("************************************************")

lancaster = LancasterStemmer()
lStems = [lancaster.stem(t) for t in tokens]
print(lStems)



Example #25
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time       : 2020/7/11 17:50
# @Author     : 代登辉
# @Email      : [email protected]
# @File       : stemmers.py
# @Software   : PyCharm
# @Description: Stemming

from nltk import PorterStemmer, LancasterStemmer, word_tokenize

raw = "My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and " \
      "loyal servant to the true emperor, Marcus Aurelius. Father to a murdered son, husband to a murdered wife. And " \
      "I will have my vengeance, in this life or the next. "
tokens = word_tokenize(raw)  # tokenize by word
porter = PorterStemmer()  # strips relatively fewer suffixes
pStems = [porter.stem(t) for t in tokens]  # suffixes (s, es, e, ed, al)
print(pStems)

lancaster = LancasterStemmer()  # more aggressive
lStems = [lancaster.stem(t) for t in tokens]  # lower-cases words and strips suffixes
print(lStems)
Example #26
    'page': TITLE,
    'format': "json"
}

R = S.get(url=URL, params=PARAMS)
DATA = R.json()

# get the text
wiki_page_text = DATA["parse"]["text"]["*"]

h = html2text.HTML2Text()
h.ignore_links = True
page_text = h.handle(wiki_page_text)

# create a new stemmer
ls = LancasterStemmer()

# tokenize text
words = nltk.word_tokenize(page_text)

words = [w.lower() for w in words]

# eliminate stop words and stem the rest of the words
words = [ls.stem(w) for w in words if w not in stopwords.words("english") and w.isalnum()]

freqs = Counter(words)

print("The 10 most frequently used stems in the ''Data science'' Wikipedia page are:")
for word, count in freqs.most_common(10):
    print(word, count)
Example #27
import json
import os
from collections import Counter

from nltk import LancasterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# define variables
articles_path = []
articleTitles = []
articleData = []
wordCount = {}
tokenTitle = []
summaryAllArticles = {}
stem = LancasterStemmer()


#  a subset of all sources for the articles in the NELA2017 dataset
sources = ["AP", "BBC", "PBS", "Salon", "Slate", "The New York Times", "BuzzFeed", "Drudge Report", "Faking News", "RedState",
           "The Gateway Pundit", "The Huffington Post"]

# second subset sources used to determine if the results so far are dependent on the current sources being used
# sources = ["CNN", "MotherJones", "NPR", "PBS", "The Hill", "Vox", "Addicting Info", "New York Daily News", "Prntly",
#            "The D.C. Clothesline", "The Duran", "Yahoo News"]

#  set of commonly used words such as "the", "a", "in" etc.
englishStopWords = set(stopwords.words('english'))
symbolStopwords = (
    {":", ";", "'", '"', '”', '“', ",", ".", "-", "_", "?", "$", "&", '...', '.', '�', '!', "''", "``", "%", "@", "--",
     ")", "(", "[", "]", "[]", "[ ]", "’", "|", "‘", " ", "'s", 'mr', 'mrs', 'one', 'two', 'said', 'hi', 'say', "n't",
Example #28
def MakeFeaturesFromText(DIR, FNAME, SENT_CLASS, max_features):
    '''
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    EXPERIMENTAL:     Still trying to figure out if it produces good
    Data sets. Use at your own risk
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    Grabs a text file and creates a sparse binary data set to training and
    makes class labels based off of the class identifiers in SENT_CLASS

    ARG: DIR          TYPE: string                 DESC: Directory where text file is saved. End with \
    ARG: FNAME          TYPE: string                 DESC: File name
    ARG: SENT_CLASS TYPE: List (of lists)     DESC: A list of lists with each internal list representing a class id
    '''

    print 'STARTING DATA GENERATION'

    from nltk import download as nldl
    from nltk import LancasterStemmer
    import nltk.corpus as corp
    import string

    # used for referencing later
    NUM_CLASSES = len(SENT_CLASS)
    TOTAL_CLASSES = NUM_CLASSES + 1

    # read file and begin cleaning
    TextFile = open(DIR + FNAME, 'r')
    RAW = TextFile.readlines()
    # removes newline, makes it all lower case, and splits it into separable words by spaces
    CLEAN = [sent[:-1].lower().split(' ') for sent in RAW]

    print 'LOADING STOPWORDS'

    # need the try, catch because it requires the stop words from the nltk database
    # the catch is just for first time running
    try:
        STOP_WORDS = corp.stopwords.words('english')
    except LookupError:
        nldl('words')
        STOP_WORDS = corp.stopwords.words('english')
    finally:
        # stop words are common words like "the", "an" "a" etc..
        STOP_WORDS += ' '.join(p for p in string.punctuation).split(' ') + ' '.join(d for d in string.digits).split(' ') + ' '.join(w for w in string.whitespace).split(' ')
        # removes stopwords and trivially short features
        CLEANER = [[word for word in sent if word not in STOP_WORDS] for sent in CLEAN if len(sent) > 3]
        KEYS = []

        print 'SORTING KEYS'
        # starts compiling the key list from the id words in sent_class
        for IDS in SENT_CLASS:
            KEYS += IDS

        print 'HEAVY DUTY STUFF'
        # finds all unique words. keeps combining and filtering duplicates
        for sent in CLEANER:
            TEMP = KEYS + sent
            KEYS = np.unique(TEMP).tolist()

        print 'COUNTING NCOs. . ',
        MAX_LIST = []
        MAX = -1
        KEY_COUNT = {}
        for sent in CLEANER:
            for word in sent:
                if not KEY_COUNT.has_key(word):
                    KEY_COUNT[word] = 0

                KEY_COUNT[word] += 1
                if KEY_COUNT[word] >= MAX:
                    MAX = KEY_COUNT[word]
                    if word in  MAX_LIST:
                        MAX_LIST.remove(word)
                    MAX_LIST.insert(0, word)
        print '.',
        TOO_FEW = [key for key in list(KEY_COUNT) if KEY_COUNT[key] <= 1]

        TOTAL_KEYS = len(list(KEY_COUNT))

        REMOVE_TOP = np.floor(TOTAL_KEYS / 10.)
        print '.',
        for SCLASS in SENT_CLASS:
            for s in SCLASS:
                if s in MAX_LIST:
                    MAX_LIST.remove(s)
                if s in TOO_FEW:
                    TOO_FEW.remove(s)
        print '.',
        for tf in TOO_FEW:
            if tf in KEYS:
                KEYS.remove(tf)
        print '.',
        TOO_MANY = MAX_LIST

        for tm in TOO_MANY:
            if tm in KEYS:
                KEYS.remove(tm)

        print 'TERMINATED THE UNDESIRABLES'
        print 'REMOVED ', len(TOO_MANY) + len(TOO_FEW), 'OF ', TOTAL_KEYS, 'UNDESIRABLES'


        print 'STARTING STEMMING'
        # removes suffixes and prefixes from words leaving the rootword only
        STEMMER = LancasterStemmer()

        # hash of stemmed words because it's tremendously faster to do it this
        # way versus calling LancasterStemmer every time
        STEM_DICT = { K : STEMMER.stem(K) for K in KEYS }

        print 'STEMMED DICTIONARY CREATED'
        # stem the class labels... and we know theyre in the hash because that
        # was the first thing we added to the key list
        STEM_LABS = [[STEM_DICT[ID] for ID in CLASS] for CLASS in SENT_CLASS]

        # now the dictionary is generated and the vectorization has begun
        DICT = STEM_DICT.values()

        # using the hash table of stemmed words to look up the stemmed root
        STEM_DATA = [[STEM_DICT[word] for word in sent if word in list(STEM_DICT)] for sent in CLEANER]

        # indexes of root words in the dictionary so that you only have to do a few lookups
        # when generating the data set of binary vectors
        INT_DATA = [np.array([DICT.index(word) for word in sent],
                                       dtype=np.int32, order='c') for sent in STEM_DATA]
        # same as above
        INT_LABS = [[DICT.index(ID) for ID in CLASS] for CLASS in STEM_LABS]

        print 'CREATING DATA SET'
        print 'I BET THIS TAKES THE LONGEST'
        # meat and potatoes
        LABS = np.zeros((len(CLEANER), 1), dtype=np.int32, order='c')
        MAT_SHAPE = (1, len(DICT))
        MAT = np.zeros(MAT_SHAPE, dtype=np.int32, order='c')


        # priming for the for loop below so that we can stack each new feature at
        # the bottom of our data set.
        M_IND = np.array(INT_DATA[0], dtype=np.int32, order='c')
        MAT[0, M_IND] = 1

        print 'STILL Go',
        CLIST = range(0, NUM_CLASSES)
        for d in range(NUM_CLASSES):
            print 'i',
            i = CLIST[np.random.randint(0, len(CLIST))]

            IND = np.atleast_2d(INT_LABS[i])

            if np.any(MAT[0, IND] == 1) and LABS[0] == 0:
                LABS[0] = i + 1
            else:
                CLIST.remove(i)


        # makes whole data set
        for i in range(1, np.size(INT_DATA, 0)):

            if np.mod(i, np.floor(np.size(INT_DATA, 0) * .2)) == 0 :
                print 'n',
            NEXT_ARRAY = np.zeros(MAT_SHAPE, dtype=np.int32, order='c')
            TO_ONES = np.array(INT_DATA[i], dtype=np.int32, order='c')
            NEXT_ARRAY[0, TO_ONES] = 1

            # making labels
            CLIST = range(0, NUM_CLASSES)
            for d in range(NUM_CLASSES):

                j = CLIST[np.random.randint(0, len(CLIST))]
                IND = np.atleast_2d(INT_LABS[j])
                if np.any(NEXT_ARRAY[0, IND] == 1) and LABS[i] == 0:
                    LABS[i] = j + 1

                CLIST.remove(j)

            MAT = np.vstack((MAT, NEXT_ARRAY))

        print 'g!'

    print 'I WAS RIGHT'

    # to reduce non-classed features the below algo tries to reduce the number of
    # non-classed features but if max-features is fewer than MAT with all non-classed features
    # removed then steps have to be taken to remove classed features
    TOTAL = np.size(MAT, 0)
    if max_features > TOTAL:
        max_features = TOTAL

    print 'THINNING THE HERD A BIT MORE'
    REMOVALS = range(0, TOTAL_CLASSES)
    if TOTAL > max_features:
        NO_CLASS_CAND = np.argwhere(LABS == 0)
        HAS_CLASS_CAND = np.argwhere(LABS != 0)

        # gets weighted number of features of each class to remove
        if (TOTAL - np.size(NO_CLASS_CAND)) >= max_features:
            HAS_CLASS_REMOVE = TOTAL - np.size(NO_CLASS_CAND) - max_features
            CLASS_FEATURES = np.array([np.sum(HAS_CLASS_CAND == CLASS) for CLASS in range(1, TOTAL_CLASSES)]) * 1.
            SHARED_REMOVE = np.floor(HAS_CLASS_REMOVE * (CLASS_FEATURES / np.sum(CLASS_FEATURES)))
            SHARED_REMOVE = SHARED_REMOVE.tolist()
            NO_CLASS_REMOVE = TOTAL - np.sum(SHARED_REMOVE) - max_features

        # no features removed
        else:
            NO_CLASS_REMOVE = TOTAL - np.size(NO_CLASS_CAND)
            REMOVALS[0] = NO_CLASS_REMOVE

            SHARED_REMOVE = range(0, NUM_CLASSES) * 0

        # removes the number of features determined above
        for c in range(0, TOTAL_CLASSES):
            if REMOVALS[c] != 0:
                for i in range(0, REMOVALS[c]):
                    CANDIDATES = np.argwhere(LABS == c)
                    DRAW = np.random.randint(0, np.size(CANDIDATES))
                    np.delete(LABS, DRAW, 0)
                    np.delete(MAT, DRAW, 0)

    print 'DONE!... where\'d you go???'
   
    
    return MAT, np.ravel(LABS, order='c')
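A hypothetical invocation of MakeFeaturesFromText; the directory, file name and class-identifier lists are placeholders, and the function as written uses Python 2 syntax:

# placeholder arguments; SENT_CLASS is a list of class-identifier word lists
MAT, LABS = MakeFeaturesFromText('C:\\data\\', 'reviews.txt',
                                 [['good', 'great'], ['bad', 'awful']],
                                 max_features=500)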
Example #29
import re
import logging

from nltk import WordNetLemmatizer, LancasterStemmer

from django.core.urlresolvers import reverse

logger = logging.getLogger(__name__)
wordnet_lemmatizer = WordNetLemmatizer()
lancaster_stemmer = LancasterStemmer()


def extract_keywords(title):
    original_keywords = [keyword.lower() for keyword in re.split(r'\W+', title)]

    try:
        # list() materializes the map so the concatenation below works on Python 3
        lemmatized_keywords = list(map(wordnet_lemmatizer.lemmatize,
                                       original_keywords))
    except LookupError:
        logging.error('Please install corpora/wordnet dictionary')
        return []

    stemmed_keywords = list(map(lancaster_stemmer.stem, original_keywords))

    return list(set(original_keywords + lemmatized_keywords +
                    stemmed_keywords))


def reverse_tastypie_url(resource_name, pk=None):
    """
    Returns tastypie url
Example #30
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import math  # for more advanced mathematical operations

app = Flask(__name__)

# ---------- DOCUMENT DATABASE CONFIGURATION ----------
db = mysql.connector.connect(host="localhost",
                             user="******",
                             passwd="",
                             database="stki")
cursor = db.cursor()

# create a variable to hold the stopwords
stop_words = set(stopwords.words('english'))
lancaster = LancasterStemmer()  # Lancaster/Paice-Husk Stemmer
eliminasi = [
    '.', '?', '!', ' ', ',', ':', ';', '(', ')', '\'', '"', '%', '&', '*', '-',
    '_', '+', '=', '{', '}', '[', ']', '\\', '|', '"', '<', '>', '/', '0', '1',
    '2', '3', '4', '5', '6', '7', '8', '9', '�'
]


def preProcessDoc(docs):
    docs_token = word_tokenize(docs)
    arr = []
    for i in range(len(docs_token)):
        docs_token[i] = docs_token[i].lower()
        if docs_token[i] not in stop_words:
            skip = 0
            for j in range(len(docs_token[i])):
Example #31
Stemming text data

Three stemming algorithms; the Lancaster stemmer is stricter than the other two.
In terms of strictness, Porter is the most lenient and Lancaster the most aggressive.
Lancaster is fast but trims away a large part of the word, so the Snowball stemmer is usually chosen.
'''
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer

words = [
    'table', 'probably', 'wolves', 'playing', 'is', 'dog', 'the', 'beaches',
    'grounded', 'dreamt', 'envision'
]

stemmers = ['PORTER', 'LANCASTER', 'SNOWBALL']

stemmer_porter = PorterStemmer()
stemmer_lancaster = LancasterStemmer()
stemmer_snowball = SnowballStemmer('english')

formatted_row = '{:>16}' * (len(stemmers) + 1)
print(formatted_row.format('WORD', *stemmers))

for word in words:
    stemmed_words = [
        stemmer_porter.stem(word),
        stemmer_lancaster.stem(word),
        stemmer_snowball.stem(word)
    ]
    print(formatted_row.format(word, *stemmed_words))
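As a rough illustration of the strictness ranking described above, the three stemmers already constructed can be compared on a single word; the exact output depends on the NLTK version, but Lancaster typically returns the shortest stem:

word = 'necessitated'
print(stemmer_porter.stem(word),
      stemmer_lancaster.stem(word),
      stemmer_snowball.stem(word))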
Example #32
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk import ngrams

pStemmer = PorterStemmer()
lStemmer = LancasterStemmer()
sStemmer = SnowballStemmer('english')
lemmetizer = WordNetLemmatizer()


def stem_each_word(tokens, lancaster_file, porter_file, snowball_file,
                   lemmetizer_file, trigrams_file):
    lancaster_file_out = open(lancaster_file, "a+")
    porter_file_out = open(porter_file, "a+")
    snowball_file_out = open(snowball_file, "a+")
    lemmetizer_file_out = open(lemmetizer_file, "a+")
    trigrams_file_out = open(trigrams_file, "a+")
    for token in tokens:
        porter_file_out.write(str(pStemmer.stem(token)) + "\t")
        lancaster_file_out.write(str(lStemmer.stem(token)) + "\t")
        snowball_file_out.write(str(sStemmer.stem(token)) + "\t")
        lemmetizer_file_out.write(str(lemmetizer.lemmatize(token)) + "\t")
    trigrams_file_out.write(str(list(ngrams(tokens, 3))))
    porter_file_out.write("\n")
    lancaster_file_out.write("\n")
    snowball_file_out.write("\n")
    lemmetizer_file_out.write("\n")
    trigrams_file_out.write("\n")
    # close the handles so the appended lines are flushed to disk
    porter_file_out.close()
    lancaster_file_out.close()
    snowball_file_out.close()
    lemmetizer_file_out.close()
    trigrams_file_out.close()
Example #33
#!/usr/bin/python

"""
This script takes tf-idf results and filters just those that are included in the review's feature list
"""

import sys
from nltk import LancasterStemmer

tfidf_fname = sys.argv[1]
features_fname = sys.argv[2]
tfidf_file = open(tfidf_fname)
features_file = open(features_fname)

stemmer = LancasterStemmer() 

stemmed_features = []
for line in features_file:
    cols = line.split(',')
    feature = cols[2]
    stemmed_words = [stemmer.stem(w) for w in feature.split()]
    stemmed_features += stemmed_words

#print stemmed_features

for line in tfidf_file:
    cols = line.split(',')
    word = cols[2]
    if word.strip() in stemmed_features:
        print(line.strip())