Code Example #1
    def __init__(self, datareader, stopwords, norm, work, split, skip_words,
                 date, porter, porter2, lanca, lanca2):
        """
        :param datareader: a Datareader object
        :param stopwords: a list of stopwords
        """
        self.norm = norm
        self.work = work
        self.split = split
        self.skip_words = skip_words
        self.date = date

        self.porter = porter
        self.porter2 = porter2
        self.lanca = lanca
        self.lanca2 = lanca2

        self.ps = stem.PorterStemmer()
        self.ls = stem.LancasterStemmer()

        train_playlists_df = datareader.get_df_train_playlists()
        test_playlists_df = datareader.get_df_test_playlists()
        concat_df = pd.concat([train_playlists_df, test_playlists_df])
        concat_df = concat_df.sort_values(['pid'], ascending=True)

        self.stopwords = stopwords
        self.titles = concat_df['name'].values  # .as_matrix() was removed in pandas 1.0
        self.tokens_dict = dict()

        self.__set_params()
        self.words = list(self.tokens_dict.keys())
Code Example #2
File: knock72.py Project: ise-ab/100knock2018
def sosei():
    stemmer2 = stem.LancasterStemmer()
    ids = defaultdict(lambda: len(ids))
    for line in open("sentiment.txt").readlines():
        line = line.split()
        line.pop(0)  # drop the "+1"/"-1" label; iterating over pop(0) itself would loop over its characters
        for word in line:
            ids[stemmer2.stem(word)]  # the bare lookup registers the stem with a fresh index
    stop = []
    for line in open("stop.txt","r"):
        stop.append(line.strip())
    sosei_list = []
    label_list = []
    for line in open("sentiment.txt","r"):
        line = line.split()
        label = line[0]
        line = line[1:]  # keep the words; assigning the result of pop(0) would keep only the label
        line2 = copy.deepcopy(line)
        for word in line:
            if stop_check(word,stop):
                line2.remove(word)
        line = [0]*len(ids)
        for word in line2:
            line[ids[stemmer2.stem(word)]] += 1
        sosei_list.append(line)
        label_list.append(label)
    return label_list,sosei_list
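
The ids = defaultdict(lambda: len(ids)) line above assigns each previously unseen stem the next free integer index as a side effect of a lookup: the default factory runs while the key is still absent, so len(ids) is exactly the next index. A minimal standalone sketch of the idiom (the words are illustrative):

from collections import defaultdict

ids = defaultdict(lambda: len(ids))
for word in ["good", "bad", "good", "great"]:
    ids[word]  # a bare lookup is enough to register the word

print(dict(ids))  # {'good': 0, 'bad': 1, 'great': 2}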
Code Example #3
def stem_token(word):
    stem_token = ""
    if stemmer_name == "Porter-Stemmer":
        #print ("Performing Porter Stemming")
        stemmer = stem.PorterStemmer()
        phrase_array_token = word.split()
        stem_token = ""
        for s in phrase_array_token:
            stem_token = stem_token + stemmer.stem(s) + " "
        stem_token = stem_token.strip(" ")
        word = stem_token
    elif stemmer_name == "Lancaster-Stemmer":
        #print ("Performing Lancaster Stemming")
        stemmer = stem.LancasterStemmer()
        phrase_array_token = word.split()
        stem_token = ""
        for s in phrase_array_token:
            stem_token = stem_token + stemmer.stem(s) + " "
        stem_token = stem_token.strip(" ")
        word = stem_token
    elif stemmer_name == "WordNet-Lemmatizer":
        #print ("Performing Wordnet Lemmatization")
        stemmer = stem.WordNetLemmatizer()
        phrase_array_token = word.split()
        stem_token = ""
        for s in phrase_array_token:
            stem_token = stem_token + stemmer.lemmatize(s) + " "
        stem_token = stem_token.strip(" ")
        word = stem_token
        #stopword[count]=stemmer.lemmatize(stopword[count])
    return (word)
Code Example #4
def demo():
    """A demonstration of the lancaster stemmer on a samples described in
    Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
    """
    from nltk import stem

    stemmer = stem.LancasterStemmer()

    print "%-20s%-20s" % ("Original Word", "Stemmed Word")
    print "*" * 40
        
    for word in (
        'maximum',    # Remove "-um" when word is intact
        'presumably', # Don't remove "-um" when word is not intact
        'multiply',   # No action taken if word ends with "-ply" 
        'provision',  # Replace "-sion" with "-j" to trigger "j" set of rules
        'owed',       # Word starting with vowel must contain at least 2 letters
        'ear',        # ditto.
        'saying',     # Words starting with consonant must contain at least 3 
        'crying',     #     letters and one of those letters must be a vowel
        'string',     # ditto.
        'meant',      # ditto.
        'cement'):    # ditto.
        stemmed_word = stemmer.stem(word)
        print "%-20s%-20s" % (word, stemmed_word)
Code Example #5
File: knock72.py Project: tmu-nlp/100knock2016
def getFeature(word_list):
    stemmer = stem.LancasterStemmer()
    # stemmer2 = stem.PorterStemmer()
    feature = defaultdict(lambda: 0)
    for word in word_list:
        if not isStopWords(word):
            word_stem = stemmer.stem(word)
            feature[word_stem] += 1
    return dict(feature)
Code Example #6
File: knock72.py Project: tmu-nlp/100knock2017
def preprocessor_words(words):
    stopwords_set = set(stopwords.words('english'))
    stemmer = stem.LancasterStemmer()

    words_preprocessed = []
    for word in words:
        if word in stopwords_set:
            continue
        stemmed = stemmer.stem(word)
        words_preprocessed.append(stemmed)

    return words_preprocessed
Code Example #7
File: knock72.py Project: ise-ab/100knock2018
def preprocessor(text):  # "text" rather than "input", which shadows the builtin
    stopwords_set = set(stopwords.words('english'))
    stemmer = stem.LancasterStemmer()

    preprocessed_list = []
    for word in text.lower().split():
        if word in stopwords_set:
            continue
        else:
            stemmed = stemmer.stem(word)
            preprocessed_list.append(stemmed)

    return ' '.join(preprocessed_list)
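
A quick call sketch; the exact stems come from the Lancaster rule table, so the output shown is indicative rather than guaranteed:

print(preprocessor("The children were running quickly"))
# roughly "childr run quick": "the" and "were" are dropped as stopwords,
# and the remaining words are cut down to their Lancaster stems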
Code Example #8
def stem_token(token):
    root_word = token
    if stemmer_name == "Porter-Stemmer":
        #print ("Performing Porter Stemming")
        stemmer = stem.PorterStemmer()
        token = stemmer.stem(token)
    elif stemmer_name == "Lancaster-Stemmer":
        #print ("Performing Lancaster Stemming")
        stemmer = stem.LancasterStemmer()
        token = stemmer.stem(token)
    elif stemmer_name == "WordNet-Lemmatizer":
        #print ("Performing Wordnet Lemmatization")
        stemmer = WordNetLemmatizer()
        token = stemmer.lemmatize(token)
    stem_to_root[token] = root_word
    return (token)
Code Example #9
    def text_to_word_list(self, text):
        ''' Pre process and convert texts to a list of words '''
        text = str(text)
        text = text.lower()
        text = sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = sub(r"what's", "what is ", text)
        text = sub(r"\'s", " ", text)
        text = sub(r"\'ve", " have ", text)
        text = sub(r"can't", "cannot ", text)
        text = sub(r"n't", " not ", text)
        text = sub(r"i'm", "i am ", text)
        text = sub(r"\'re", " are ", text)
        text = sub(r"\'d", " would ", text)
        text = sub(r"\'ll", " will ", text)
        text = sub(r",", " ", text)
        text = sub(r"\.", " ", text)
        text = sub(r"!", " ! ", text)
        text = sub(r"\/", " ", text)
        text = sub(r"\^", " ^ ", text)
        text = sub(r"\+", " + ", text)
        text = sub(r"\-", " - ", text)
        text = sub(r"\=", " = ", text)
        text = sub(r"'", " ", text)
        text = sub(r"(\d+)(k)", r"\g<1>000", text)
        text = sub(r":", " : ", text)
        text = sub(r" e g ", " eg ", text)
        text = sub(r" b g ", " bg ", text)
        text = sub(r" u s ", " american ", text)
        text = sub(r"\0s", "0", text)
        text = sub(r" 9 11 ", "911", text)
        text = sub(r"e - mail", "email", text)
        text = sub(r"j k", "jk", text)
        text = sub(r"\s{2,}", " ", text)
        text = word_tokenize(text)
        normalized_sentence = []
        lancaster = stem.LancasterStemmer()
        lemmatizer = WordNetLemmatizer()

        for word in text:
            if self.normalizer == 'lancaster':
                normalized_sentence.append(lancaster.stem(word))
            elif self.normalizer == 'wordnet':
                normalized_sentence.append(lemmatizer.lemmatize(word))
            else:
                normalized_sentence.append(word)
        return normalized_sentence
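
Only the normalizer attribute changes between the two branches, but the results can diverge a lot on the same input. A hedged illustration, assuming p is an instance of this class:

p.normalizer = 'lancaster'
p.text_to_word_list("a provision was presumably made")
# Lancaster stems aggressively, e.g. 'provision' -> 'provid' and 'presumably' -> 'presum'

p.normalizer = 'wordnet'
p.text_to_word_list("a provision was presumably made")
# WordNet lemmatization is conservative and mostly returns the tokens unchanged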
Code Example #10
File: grams.py Project: ramyananth/concept-graphs
def stem_token(stemmer_name, stopword):
    if stemmer_name == "Porter-Stemmer":
        #print ("Performing Porter Stemming")
        stemmer = stem.PorterStemmer()
        for count in range(len(stopword)):
            stopword[count] = stemmer.stem(stopword[count])
    elif stemmer_name == "Lancaster-Stemmer":
        #print ("Performing Lancaster Stemming")
        stemmer = stem.LancasterStemmer()
        for count in range(len(stopword)):
            stopword[count] = stemmer.stem(stopword[count])
    elif stemmer_name == "WordNet-Lemmatizer":
        #print ("Performing Wordnet Lemmatization")
        stemmer = WordNetLemmatizer()
        for count in range(len(stopword)):
            stopword[count] = stemmer.lemmatize(stopword[count])
    return (stopword)
Code Example #11
 def stemText(self, text, intensity):
     """Apply stemming to a string according to :intensity."""
     #select nltk stemmer ('==' rather than 'is': strings need equality, not identity, comparison)
     if intensity == 'light':
         s = stem.PorterStemmer()
     elif intensity == 'medium':
         s = stem.snowball.EnglishStemmer()
     elif intensity == 'heavy':
         s = stem.LancasterStemmer()
     else:
         raise Exception(
             "'{0}' is not a correct intensity parameter. Must be light, medium or heavy."
             .format(intensity))
     bow = text.split(" ")  #this creates a bag of words
     result = []
     for word in bow:
         result.append(s.stem(word))
     return ' '.join(result)
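
A usage sketch, assuming tp is an instance of the owning class:

for intensity in ('light', 'medium', 'heavy'):
    print(intensity, '->', tp.stemText('the organizers were running generously', intensity))
# 'heavy' (Lancaster) truncates hardest of the three; any other intensity
# value raises the Exception defined above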
Code Example #12
    def __init__(self, datareader, stopwords=None):
        """
        :param datareader: a Datareader object
        :param stopwords: a list of stopwords
        """
        self.stopwords = stopwords if stopwords is not None else []  # avoid a mutable default argument
        self.ps = stem.PorterStemmer()
        self.ls = stem.LancasterStemmer()

        train_playlists_df = datareader.get_df_train_playlists()
        test_playlists_df = datareader.get_df_test_playlists()

        concat_df = pd.concat([train_playlists_df, test_playlists_df])

        if datareader.offline():
            concat_df = concat_df.sort_values(['pid'], ascending=True)

        self.playlists = concat_df['pid'].values
        self.titles = concat_df['name'].values
        self.tokens_dict = dict()

        self.__set_params()
        self.words = list(self.tokens_dict.keys())
Code Example #13
File: knock72.py Project: tmu-nlp/100knock2017
def preprocessor_data(data, ids, test=0):
    stopwords_set = set(stopwords.words('english'))
    stemmer = stem.LancasterStemmer()

    data_in_preprocessed = []
    labels = []

    for line in data:
        words_preprocessed = []
        line = line.lower()  # str.lower() returns a new string; the bare call discarded the result
        tokens = line.split()
        label, words = tokens[0], tokens[1:]
        labels.append(int(label))

        for word in words:
            if word in stopwords_set:
                continue
            stemmed = stemmer.stem(word)
            if test == 0:
                ids[stemmed]  # register the stem in the vocabulary while building training data
            words_preprocessed.append(stemmed)
        data_in_preprocessed.append(words_preprocessed)

    return data_in_preprocessed, labels
Code Example #14
def postgre_retrieve_sentences(word, language):
    conn_string = "dbname= 'postgres' user='******' host='postgre-psd.postgres.database.azure.com' password='******' port='5432' "

    con = psycopg2.connect(conn_string)
    curs = con.cursor()

    #word = ls.stem("eyes")
    #word = "банковский"
    #language = "russian"
    #executing the query for retrieval
    if language == "english":
        ls = stem.LancasterStemmer()
        word = ls.stem(word)
        curs.execute(f"select sentence from {language} where word='{word}'")
        x = curs.fetchall()
        return x
    if language == "russian":
        curs.execute(f"select sentence from {language} where word='{word}'")
        x = curs.fetchall()
        return x
    if language == "turkish":
        curs.execute(f"select sentence from {language} where word='{word}'")
        x = curs.fetchall()
        return x
Code Example #15
 def stem_text_contents(self):
     if stemmer_name == "Porter-Stemmer":
         stemmer = stem.PorterStemmer()
         for counter in range(len(self.text)):
             text_tokens = self.text[counter].split()
             stem_text = ""
             for t in text_tokens:
                 root_word = t
                 stem_text = stem_text + stemmer.stem(t) + " "
                 stem_to_root[stemmer.stem(t)] = root_word
             stem_text = stem_text.strip(" ")
             self.stem_text.append(stem_text)
     elif stemmer_name == "Lancaster-Stemmer":
         stemmer = stem.LancasterStemmer()
         for counter in range(len(self.text)):
             text_tokens = self.text[counter].split()
             stem_text = ""
             for t in text_tokens:
                 root_word = t
                 stem_text = stem_text + stemmer.stem(t) + " "
                 stem_to_root[stemmer.stem(t)] = root_word
             stem_text = stem_text.strip(" ")
             self.stem_text.append(stem_text)
     elif stemmer_name == "WordNet-Lemmatizer":
         stemmer = WordNetLemmatizer()
         for counter in range(len(self.text)):
             text_tokens = self.text[counter].split()
             stem_text = ""
             for t in text_tokens:
                 root_word = t
                 stem_text = stem_text + stemmer.lemmatize(t) + " "
                 stem_to_root[stemmer.lemmatize(t)] = root_word  # WordNetLemmatizer has no stem() method
             stem_text = stem_text.strip(" ")
             self.stem_text.append(stem_text)
     else:
         self.stem_text = self.text
Code Example #16
'''
usage
$ python knock50.py | python knock51.py | python knock52.py
'''
from nltk import stem
import sys

lines_list = sys.stdin.readlines()

# create the stemmers once instead of on every iteration
porter_stm = stem.PorterStemmer()
lancas_stm = stem.LancasterStemmer()

for line in lines_list:
    l = line.strip('\n')
    p_gokan = porter_stm.stem(l.lower())
    l_gokan = lancas_stm.stem(l.lower())
    print("{0}\t{1}\t{2}".format(l, p_gokan, l_gokan))
Code Example #17
def test_stemming(word):
    print('WordNetLemmatizer:', stem.WordNetLemmatizer().lemmatize(word))
    print('LancasterStemmer:', stem.LancasterStemmer().stem(word))
    print('PorterStemmer:', stem.PorterStemmer().stem(word))
    print('RegexpStemmer:', stem.RegexpStemmer('ing$|s$|e$', min=4).stem(word))
    print('SnowballStemmer:', stem.SnowballStemmer('english').stem(word))
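
Called on a word such as 'growing', the five normalizers typically agree except for the lemmatizer, which defaults to the noun part of speech and returns the word unchanged:

test_stemming('growing')
# WordNetLemmatizer: growing
# LancasterStemmer: grow
# PorterStemmer: grow
# RegexpStemmer: grow
# SnowballStemmer: grow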
Code Example #18
File: hello.py Project: laurii/learning-flask
 def __init__(self):
     self.stopwords = stopwords.words('english')
     self._lancaster = stem.LancasterStemmer()
     self._porter = stem.PorterStemmer()
     self._lemmatizer = stem.WordNetLemmatizer()
Code Example #19
import re
from collections import defaultdict
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import stem
stemmer = stem.LancasterStemmer()
#-----------  MINING PHRASES FROM TRAINING DATA --------------


def getSynonyms(word):
    syns = wn.synsets(word)
    synonyms = [item.lemmas()[0].name() for item in syns]
    return list(set(synonyms))
    # print(unknown + ': ')
    # print(list(set(synonyms)))

def topFreq(wordList):
    freqs = {}
    for word in wordList:
        freqs[word] = train_dic[word]
    for k in sorted(freqs, key=lambda k: freqs[k], reverse=True):
        if freqs[k] > 0: return k
    return None

train_phrases = open('../data/training_text', 'r')
test_phrases = open('../data/test_text', 'r')
train_phrases.readline()
test_phrases.readline()

train_dic = defaultdict(int)
test_dic = defaultdict(int)
Code Example #20
def stemming(words):
    ls = stem.LancasterStemmer()
    stemmed_words = []
    for word in words:
        stemmed_words.append(ls.stem(word))
    return (stemmed_words)
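
A usage sketch; the expected stems follow the Lancaster rules shown in Code Example #4:

print(stemming(['maximum', 'presumably', 'saying']))
# ['maxim', 'presum', 'say']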