def stem_words(string, sep=" "):
    """Stem the words in a string.

    Each word is first stemmed with the Italian stemmer; if that leaves the
    word unchanged, the English stemmer is applied instead.

    Parameters
    ----------
    string : str
        A string containing words separated by `sep`.

    sep : str
        The word separator (default ' ').

    Returns
    -------
    The stemmed string.
    """

    stemmerIta = stem.SnowballStemmer("italian")
    stemmerEng = stem.SnowballStemmer("english")

    string = string.split(sep)
    string = [
        stemmerIta.stem(i) if stemmerIta.stem(i) != i else stemmerEng.stem(i)
        for i in string
    ]

    return sep.join(string)
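
# Usage sketch (added; not part of the original example), assuming this module
# already does `from nltk import stem` as the function body implies. "parlando"
# is changed by the Italian stemmer, so it keeps the Italian stem; "running" is
# left untouched by it and therefore falls back to the English stemmer.
print(stem_words("parlando running"))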
Example #2
def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    #text = clean_tweet(text)
    print(text, '\n')
    text = text.lower()  # lowercase text
    print(text, '\n')
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # replace symbols matched by REPLACE_BY_SPACE_RE with a space
    print(text, '\n')
    text = BAD_SYMBOLS_RE.sub(' ', text)  # replace symbols matched by BAD_SYMBOLS_RE with a space
    print(text, '\n')
    text = text.replace('rt', ' ')  # drop "rt" (retweet) markers
    print(text, '\n')
    text = re.sub(r'\d+', '', text)  # remove digits
    text = text.replace('&#', ' ')  # strip leftover HTML entity prefixes
    print(text, '\n')
    text = re.sub(r'\W+', ' ', text)  # collapse non-word characters into spaces
    #text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwords from text
    text = text.split()
    lemm = Stem.SnowballStemmer("english")  # despite the name, this is a stemmer rather than a lemmatizer
    lemm_words = [lemm.stem(word) for word in text]
    text = " ".join(lemm_words)
    print(text, '\n')
    return text
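
# The function above relies on module-level names the snippet does not show:
# REPLACE_BY_SPACE_RE, BAD_SYMBOLS_RE and the `Stem` alias. The definitions
# below are a plausible reconstruction (an assumption, not the original code).
import re
from nltk import stem as Stem

REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')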
Example #3
    def __init__(self,
                 weight_gpop=0,
                 stemm=True,
                 stemmer='porter',
                 tokenize=True,
                 clean=True,
                 synonyms=False,
                 fuzzy=True,
                 fuzz_thres=0,
                 add_artists=False,
                 add_albums=False,
                 return_num_predictions=500):
        '''
        Constructor
        '''
        self.weight_gpop = weight_gpop
        self.return_num_predictions = return_num_predictions
        self.add_artists = add_artists
        self.add_albums = add_albums

        self.stemm = stemm
        self.tokenize = tokenize
        self.clean = clean
        self.fuzzy = fuzzy
        self.fuzz_thres = fuzz_thres

        if stemmer == 'wn':
            self.stemmer = stem.WordNetLemmatizer()
        elif stemmer == 'porter':
            self.stemmer = stem.PorterStemmer()
        elif stemmer == 'snowball':
            self.stemmer = stem.SnowballStemmer('english')
        self.stemmers = stemmer

        self.synonyms = synonyms
Example #4
def test_word_stemming_filter():
    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to',
                           default_duration=1)

    # With all defaults (porter stemmer)
    filt = WordStemmingFilter()
    assert isinstance(filt.stemmer, nls.PorterStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    target = ['some', 'sampl', 'text', 'for', 'test', 'annot']
    assert stems == target

    # Try a different stemmer
    filt = WordStemmingFilter(stemmer='snowball', language='english')
    assert isinstance(filt.stemmer, nls.SnowballStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Handles StemmerI stemmer
    stemmer = nls.SnowballStemmer(language='english')
    filt = WordStemmingFilter(stemmer=stemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Fails on invalid values
    with pytest.raises(ValueError):
        filt = WordStemmingFilter(stemmer='nonexistent_stemmer')

    # Try a long text stim
    stim2 = TextStim(text='theres something happening here')
    filt = WordStemmingFilter()
    assert filt.transform(stim2).text == 'there someth happen here'
Example #5
    def is_unique(self, word):
        stemmer = stem.SnowballStemmer("english")
        root = stemmer.stem(word)
        for card in self.cards:
            if root == stemmer.stem(card.word):
                return False
        return True
Example #6
def reduz_ao_radical_stem(palavras):
    snow = stem.SnowballStemmer('english')
    if type(palavras) is str:
        return snow.stem(palavras)
    if type(palavras) is list:
        for i in range(len(palavras)):
            palavras[i] = snow.stem(palavras[i])
    return palavras
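
# Usage sketch (added), assuming `from nltk import stem` at module level: the
# helper accepts either a single word or a list of words.
print(reduz_ao_radical_stem("running"))             # 'run'
print(reduz_ao_radical_stem(["running", "flies"]))  # ['run', 'fli']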
Example #7
def stemwords(words, stemmer=None):
    if not stemmer:
        stemmer = stem.SnowballStemmer('english')
    for i in range(len(words)):
        words[i] = stemmer.stem(words[i])

    return words
Example #8
def stemming(x):
    stemmer = stem.SnowballStemmer("english")
    words = x.split()
    doc = []
    for word in words:
        word = stemmer.stem(word)
        doc.append(word)
    return " ".join(doc)
Example #9
def test_word_stemming_filter():
    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to',
                           default_duration=1)

    # With all defaults (porter stemmer)
    filt = WordStemmingFilter()
    assert isinstance(filt.stemmer, nls.PorterStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    target = ['some', 'sampl', 'text', 'for', 'test', 'annot']
    assert stems == target

    # Try a different stemmer
    filt = WordStemmingFilter(stemmer='snowball', language='english')
    assert isinstance(filt.stemmer, nls.SnowballStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Handles StemmerI stemmer
    stemmer = nls.SnowballStemmer(language='english')
    filt = WordStemmingFilter(stemmer=stemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Try lemmatization filter
    try:
        nltk.data.find('taggers/universal_tagset')
    except LookupError:
        nltk.download('universal_tagset')
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    stim = ComplexTextStim(text='These are tests for Stemming filters')
    filt = WordStemmingFilter(stemmer='wordnet')
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['these', 'be', 'test', 'for', 'stem', 'filter']
    assert lemmas == target

    # Try case sensitive
    filt = WordStemmingFilter(stemmer='wordnet', case_sensitive=True)
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['These', 'be', 'test', 'for', 'Stemming', 'filter']
    assert lemmas == target

    # Fails on invalid values
    with pytest.raises(ValueError):
        filt = WordStemmingFilter(stemmer='nonexistent_stemmer')

    # Try a long text stim
    stim2 = TextStim(text='theres something happening here')
    filt = WordStemmingFilter()
    assert filt.transform(stim2).text == 'there someth happen here'
Example #10
    def __init__(self, stemmer_type):
        if stemmer_type == 'porter':
            self.stemmer = stemming.PorterStemmer()
        elif stemmer_type == 'snowball':
            self.stemmer = stemming.SnowballStemmer('english')
        elif stemmer_type == 'lemmatize':
            self.stemmer = WordNetStemmer()
        else:
            raise NameError('\'%s\' not supported' % stemmer_type)
Example #11
def stem_words(msg):
    from nltk import stem

    stemmer = stem.SnowballStemmer('english')

    msg = msg.split()

    msg = " ".join([stemmer.stem(word) for word in msg])

    return msg
Example #12
    def __init__(self):
        self.n_grams = [1, 2, 3, 4]
        self.stem_generator = stem.SnowballStemmer("english")

        self.total_num = 0
        self.generate_corpus = []
        self.reference_corpus = []
        self.gen_corpus_count = []
        self.ref_corpus_count = []
        self.gen_document_frequency = defaultdict(int)
        self.ref_document_frequency = defaultdict(int)
Example #13
def english_token(sentence, tokenize_flag=1, is_filter_stopword=1, stem_flag=1, lemma_flag=1):
    # Two English tokenization options; option 2 works better.
    if tokenize_flag == 1:
        source_tokens = word_tokenize(sentence)
    elif tokenize_flag == 2:
        tokenizer = tokenize.WordPunctTokenizer()
        source_tokens = tokenizer.tokenize(sentence)
    # print(source_tokens)

    # Remove punctuation tokens.
    for token in source_tokens[::-1]:
        if len(token) == 1 and not token.isalpha():
            source_tokens.remove(token)

    # Filter out stopwords.
    if is_filter_stopword:
        list_stopWords = list(set(corpus.stopwords.words('english')))
        filtered_stop_words = [w for w in source_tokens if w not in list_stopWords]
    else:
        filtered_stop_words = source_tokens
    # print(filtered_stop_words)

    # Two stemming tools; option 2 works better.
    stem_tokens = []
    if stem_flag == 1:
        porterStemmer = stem.PorterStemmer()
        for word in filtered_stop_words:
            stem_tokens.append(porterStemmer.stem(word))
    elif stem_flag == 2:
        snowballStemmer = stem.SnowballStemmer('english')
        for word in filtered_stop_words:
            stem_tokens.append(snowballStemmer.stem(word))

    # Lemmatize nouns and verbs back to their base forms; option 2 works better.
    lemma_tokens = []
    if lemma_flag == 1:
        lemmatizer = stem.WordNetLemmatizer()
        for word in stem_tokens:
            # Reduce nouns to their singular form.
            n_lemma = lemmatizer.lemmatize(word, pos='n')
            # Reduce verbs to their base form.
            v_lemma = lemmatizer.lemmatize(n_lemma, pos='v')
            # print('%8s %8s %8s' % (word, n_lemma, v_lemma))
            lemma_tokens.append(v_lemma)
    elif lemma_flag == 2:
        lemmatizer = stem.wordnet.WordNetLemmatizer()
        tagged_corpus = pos_tag(stem_tokens)
        for token, tag in tagged_corpus:
            if tag[0].lower() in ['n', 'v']:
                lemma_tokens.append(lemmatizer.lemmatize(token, tag[0].lower()))
            else:
                lemma_tokens.append(token)

    return lemma_tokens
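
# Usage sketch (added; not part of the original snippet). The function assumes
# roughly these imports, plus the punkt, stopwords, wordnet and
# averaged_perceptron_tagger NLTK data packages:
#
#   from nltk import corpus, pos_tag, stem, tokenize
#   from nltk.tokenize import word_tokenize
tokens = english_token("The cats were running quickly.",
                       tokenize_flag=2, is_filter_stopword=1,
                       stem_flag=2, lemma_flag=2)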
Example #14
def preprocess(News: List[str]):
    """
    :param News: list of news texts
    :return: the news texts with every token stemmed; stemming needs tokenization,
        but the later tf-idf transform expects whole sentences, so the tokens are
        joined back together after stemming. Returns List[str].
    """
    print("Now starting to stem tokens in the news...")
    sb_stemmer = ns.SnowballStemmer("english")
    stem_news = []
    for doc in tqdm(News):
        words = tk.word_tokenize(doc)
        words = [sb_stemmer.stem(each) for each in words]
        stem_news.append(' '.join(words))
    return stem_news
Example #15
def review_messages(texts):
    stoplist = stopwords.words('english')
    stoplist.append('Subject')
    stemmer = stem.SnowballStemmer('english')

    words = process_email(texts)
    msg = []
    for word in words:
        if word not in stoplist:
            msg.append(word)
    # using a stemmer
    msg = " ".join([stemmer.stem(word) for word in msg])
    return msg
Example #16
def tokenize_comment(comment, voc, voc_index):
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    stemmer = stem.SnowballStemmer('russian')
    result = []
    for sent in tokenize.sent_tokenize(comment):
        filtered = [word for word in tokenizer.tokenize(sent) \
                if word not in corpus.stopwords.words('russian')]
        stemmed = [stemmer.stem(word) for word in filtered]
        for word in stemmed:
            if word not in voc:
                voc[word] = voc_index
                voc_index += 1
        result += stemmed
    return voc_index, result
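
# Usage sketch (added; not part of the original snippet): the vocabulary dict
# and the running index are threaded through successive calls so every new stem
# gets a stable integer id. Assumes `from nltk import corpus, stem, tokenize`
# plus the punkt and Russian-stopwords data.
voc = {}
voc_index = 0
voc_index, tokens = tokenize_comment("Отличный товар, всем рекомендую!", voc, voc_index)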
Example #17
def clean_text(text):
    text = text.lower()

    # Lancaster is strictest and PorterStemmer is least strict.
    # Snowball is in the middle, so we use Snowball here for stemming text.
    stemmer = stem.SnowballStemmer('english')
    stop_words = set(stopwords.words('english'))

    word_tokens = word_tokenize(text)
    not_punctuation = [
        word for word in word_tokens if word not in string.punctuation
    ]
    cleaned = [word for word in not_punctuation if word not in stop_words]
    cleaned_stem = [stemmer.stem(word) for word in cleaned]
    cleaned_join = " ".join(cleaned_stem)

    return cleaned_join
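
# Usage sketch (added), assuming `import string`, `from nltk import stem`,
# `from nltk.corpus import stopwords` and `from nltk.tokenize import word_tokenize`:
# punctuation and stop words are dropped and the remaining words are stemmed.
print(clean_text("The foxes are running quickly."))  # e.g. 'fox run quick'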
Example #18
    def __call__(self, doc):

        tokens = tokenizer.word_tokenize(doc)
        tokens = [
            token for token in tokens
            if token.isalnum() and len(token) > 0 and not token.isspace()
        ]

        if self.eliminate_stopwords:

            stop_words = stopwords.words("english")
            tokens = [token for token in tokens if token not in stop_words]

        if self.apply_stemming:
            snowball_stemmer = stemmer.SnowballStemmer("english")
            tokens = [snowball_stemmer.stem(token) for token in tokens]

        return tokens
Example #19
def vectorize(input, stop_words, max_words):
    """
    CREATE FEATURES BASED ON THE STEMMED UNIGRAM AND BIGRAM TFIDF VALUES
    input: list of text reviews
    stop_words: additional common words not used as features
    max_words: total number of features for classifier, typically 10,000
    """
    token = TfidfVectorizer().build_tokenizer()
    stemmer = stem.SnowballStemmer("english", ignore_stopwords=True)
    stopW = map(stemmer.stem, stopwords.words('english') + stop_words)

    def tstem(text):
        return map(stemmer.stem, token(text))

    tfidf = TfidfVectorizer(max_features=max_words,
                            ngram_range=(1, 2),
                            stop_words=stopW,
                            tokenizer=tstem)

    return tfidf.fit(input)
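
# Usage sketch (added; not part of the original snippet), assuming
# `from nltk import stem`, `from nltk.corpus import stopwords` and
# `from sklearn.feature_extraction.text import TfidfVectorizer`: fit the stemmed
# unigram/bigram TF-IDF vocabulary on a list of reviews, then transform them.
reviews = ["Great food and friendly staff.", "Terrible service, never again."]
tfidf = vectorize(reviews, stop_words=[], max_words=10000)
X = tfidf.transform(reviews)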
Example #20
def preprocessing(email):
    # 1. Convert everything to lowercase.
    email = email.lower()

    # 2. Strip HTML tags.
    email = re.sub(r'<[^<>]+>', ' ', email)

    # 3. Replace URLs with the token "httpaddr".
    email = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email)

    # 4. Replace e-mail addresses with "emailaddr".
    email = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email)

    # 5. Replace all dollar signs ($) with "dollar".
    email = re.sub(r'[\$]+', 'dollar', email)

    # 6. Replace numbers with "number" ([0-9]+ matches one or more digits).
    email = re.sub(r'[0-9]+', 'number', email)

    # 7. Split into tokens for stemming.
    tokens = re.split(
        r'[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%]', email)
    tokenlist = []

    s = ns.SnowballStemmer('english')

    for token in tokens:

        # 8. Remove any remaining non-alphanumeric characters from the token.
        token = re.sub(r'[^a-zA-Z0-9]', '', token)
        stemmed = s.stem(token)

        # 9. Skip empty tokens.
        if not len(token): continue
        tokenlist.append(stemmed)

    return tokenlist
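
# Usage sketch (added), assuming `import re` and `from nltk import stem as ns`
# as the function body implies; URLs, e-mail addresses, dollar signs and digits
# are normalized to placeholder tokens before stemming.
tokens = preprocessing("Visit http://example.com or write to me@example.com for $100")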
Example #21
def scrub_txt(dirty_txt):
    '''scrub text: lower, stop words, and stem

    Argument(s):
    dirty_txt (str): string to clean

    Return:
    clean_txt (str): clean string
    '''

    # potential improvements:
    # - remove punctuation

    stemmer = stem.SnowballStemmer('english')
    eng_stopwords = set(stopwords.words('english'))

    # lower, stop, stem
    clean_txt = dirty_txt.lower()
    clean_txt = [
        word for word in clean_txt.split() if word not in eng_stopwords
    ]
    clean_txt = " ".join([stemmer.stem(word) for word in clean_txt])

    return clean_txt
Example #22

def split_sentences2(text):
    return nltk.sent_tokenize(text)


if __name__ == '__main__':

    w = "w"
    a = "a"
    print(w, end="")
    print()
    print(a)

    stemmer = stem.PorterStemmer()
    stemmer2 = stem.SnowballStemmer(language="english")
    print(stemmer.stem("colonization"))
    print(stemmer2.stem("colonization"))
    print(len(stopwords.words("english")))

    lang = "tr"
    folderpath = "/home/dicle/Documents/data/tr/radikal_5class_newstexts/ekonomi"
    #instances = corpus_utils.read_n_files(folderpath, N=2)
    instances, labels = corpus_utils.get_20newsgroup()
    instances = instances[:2]
    for i, text in enumerate(instances):
        print(i, " Sentences:")
        print(split_sentences1(text))
        print("####")
        print(split_sentences2(text))
        print(
Example #23
    def language(self, value):
        self._language = value
        self.normalizer = stem.SnowballStemmer(self.language.lower())
Example #24
    def __init__(self, language='English'):
        self._language = language
        self.normalizer = stem.SnowballStemmer(self.language.lower())
Example #25
    def __init__(self):
        self.stemmer = stem.SnowballStemmer('english')
        self.dict = {}
Example #26
def alignment_score(ppdbLines, word1, word2):
    stemmer = stem.SnowballStemmer("english")
    if (stemmer.stem(word1) == stemmer.stem(word2)):
        return MAX_ALIGNMENT_SCORE
    else:
        return _paraphrase_score(ppdbLines, word1, word2)
Example #27
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f" % (best_i, best_dist))

vectorizer = CountVectorizer(
    min_df=1,
    stop_words='english')  # stop_words: frequent words such as "most", "a", "about" are excluded from the counts
sorted(vectorizer.get_stop_words())[:50]  # roughly how many are there? about 318
len(vectorizer.get_stop_words())

# Merging words that share a meaning; the proper name for this is stemming.
# (It looked like it needed an extra package download, but apparently not.)

from nltk import stem

english_stemmer = stem.SnowballStemmer('english')  # several stemmers exist; for English use Snowball
english_stemmer.stem('imaging')
english_stemmer.stem('image')
english_stemmer.stem('imagine')
english_stemmer.stem('buys')
english_stemmer.stem('buying')
english_stemmer.stem('bought')


class StemmedCountVectorizer(CountVectorizer):
    # Define a CountVectorizer subclass whose analyzer stems every token.
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
        # This trick really does feel quite advanced...
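

# Usage sketch (added; not part of the original book snippet): the subclass
# behaves exactly like CountVectorizer, except that every token produced by the
# analyzer is stemmed before it is counted.
stem_vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')
X_counts = stem_vectorizer.fit_transform(["imaging images", "imagine buying books"])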
    logger.debug("train size:" + str(train_data.shape) + " test size:" +
                 str(eval_data.shape))
    label = 'author'
    eval_id = eval_data['id']

    # CREATE TARGET VARIABLE
    logger.debug("One hot encoding for label")
    train_data["EAP"] = (train_data.author == "EAP") * 1
    train_data["HPL"] = (train_data.author == "HPL") * 1
    train_data["MWS"] = (train_data.author == "MWS") * 1
    target_vars = ["EAP", "HPL", "MWS"]
    Y_train = train_data[target_vars].values

    # STEMMING WORDS
    logger.debug("Sterm text ..")
    stemmer = stm.SnowballStemmer("english")
    stem_text = train_data.text.apply(lambda x: (" ").join(
        [stemmer.stem(z) for z in re.sub("[^a-zA-Z0-9]", " ", x).split(" ")]))
    eval_stem_text = eval_data.text.apply(lambda x: (" ").join(
        [stemmer.stem(z) for z in re.sub("[^a-zA-Z0-9]", " ", x).split(" ")]))
    all_sterm_text = pd.concat([stem_text, eval_stem_text])

    logger.debug("Tokenizing text ..")
    # prepare tokenizer
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_sterm_text)
    vocab_size = len(tokenizer.word_index) + 1
    logger.debug("vocab size:" + str(vocab_size))
    # integer encode the documents
    encoded_text = tokenizer.texts_to_sequences(stem_text)
    # print(encoded_text[:3])
Example #29
# In[125]:

#lala = csv_data[['author_id', 'name', 'EXPERTISE', 'ins_name\r', 'URL']]
#lala = lala.replace({'ins_name\r': {r'\r':''}}, regex = True)

#lala.to_csv(r'C:\Users\admin\Desktop\professor_data.txt', header = None, index = None, sep = '\t', mode = 'a', encoding = 'utf-8')

# In[126]:

# In[127]:

STOP_WORDS_FILENAME = r'C:\Users\admin\Desktop\stop_words_topic.txt'

# In[128]:

eng_stemmer = stem.SnowballStemmer('english')


class Indexable(object):
    """Class representing an object that can be indexed.

    It is a general abstraction for indexable objects and can be used in
    different contexts.

    Args:
      iid (int): Identifier of indexable objects.
      metadata (str): Plain text with data to be indexed.
Example #30
def stemming(words):
    stemmer = stem.SnowballStemmer('english')
    words = [stemmer.stem(word) for word in words]
    return words