Example 1
def load_clueweb12_B13_termstat_stemmed() -> Tuple[Dict, Dict]:
    from krovetzstemmer import Stemmer
    stemmer = Stemmer()
    tf, df = load_clueweb12_B13_termstat()
    new_tf = Counter()

    for key, cnt in tf.items():
        new_tf[stemmer.stem(key)] += cnt

    df_info = defaultdict(list)
    for key, cnt in df.items():
        df_info[stemmer.stem(key)].append(cnt)

    new_df = Counter()
    for key, cnt_list in df_info.items():
        cnt_list.sort(reverse=True)
        discount = 1
        discount_factor = 0.3
        df_est = 0
        for cnt in cnt_list:
            df_est += cnt * discount
            discount *= discount_factor

        new_df[key] = int(df_est)
    return new_tf, new_df
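
The discount loop above merges the document frequencies of the surface forms that share a stem: the largest count is kept in full and each further count is scaled by an extra factor of 0.3, presumably so that documents containing several forms of the same stem are not counted more than once. A minimal standalone sketch of the same computation (the helper name discounted_df is illustrative, not part of the original code):

def discounted_df(counts, discount_factor=0.3):
    # Largest df contributes fully; every further df is damped by an
    # additional factor of 0.3 before being added to the estimate.
    total, weight = 0.0, 1.0
    for cnt in sorted(counts, reverse=True):
        total += cnt * weight
        weight *= discount_factor
    return int(total)

# e.g. merging df("run")=100, df("runs")=40, df("running")=10:
# 100 + 0.3 * 40 + 0.09 * 10 = 112.9 -> 112
print(discounted_df([100, 40, 10]))  # prints 112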
Example 2
def clean_text(text):
    stemmer = Stemmer()
    return [
        stemmer.stem(token.lower())
        for token in nltk.word_tokenize(re.sub("[ ]+", " ", re.sub("\n", "", text)))
        if (token.isalnum() and token not in stopwords.words("english"))
    ]
Example 3
def main():
    global args
    save_prefix = args.s

    if args.f is None:
        print('Please specify input file.')
        return

    # read file
    with open(args.f, 'r') as f:
        document = f.read()

    # strip punctuation
    document = re.sub(r"[<>:;()\?\.\!\/_,&%^*(+\"\']+", " ", document)

    # stop word list; can be replaced with your own list
    stop_word_list = stopwords.words('english')

    # tokenize, lowercase, filter stop words, and stem
    s = Stemmer()

    word_list = [
        s.stem(w.lower()) for w in word_tokenize(text=document)
        if w.lower() not in stop_word_list
    ]

    with open(save_prefix + '/result.txt', 'a') as f:
        f.write(' '.join(word_list))
Example 4
def stem(algo, text):
    if algo == "krovetz":
        stemmer = Stemmer()
        return stemmer.stem(text)
    elif algo == "porter":
        stm = PorterStemmer()
        return stm.stem(text)
    print("ERROR STEMMING: {t} unkown.".format(t=algo))
Example 5
def stem(algo, text):
    if algo == "krovetz":
        stemmer = Stemmer()
        return stemmer.stem(text)
    elif algo == "porter":
        s = stm()
        return s.stem(text)
    else:
        print("ERROR STEMMING: {t} unknown.".format(t=algo))
        exit()
Example 6
def stem(text):
    # print("Stemming...")
    stemmer = Stemmer()
    stemmed = ""
    for word in text.split():
        # Start a new line before each 'docid' marker so that each
        # document ends up on its own line in the output.
        if word == 'docid':
            stemmed = stemmed + '\n'
        stemmed = stemmed + ' ' + stemmer.stem(word)

    return stemmed
Example 7
def load_df_stemmed(term_stat_path):
    stemmer = Stemmer()
    df = load_df(term_stat_path)

    new_df = Counter()
    for key, value in df.items():
        try:
            new_df[stemmer.stem(key)] += value
        except UnicodeDecodeError:
            pass
    return new_df
Example 8
def tokenize(text):
    stemmer = Stemmer()
    return [
        stemmer.stem(token.lower()) for token in nltk.word_tokenize(
            re.sub(
                "\n",
                "",
                text.translate(
                    str.maketrans(punctuation, " " * len(punctuation))),
            )) if (token.isalnum() and token.lower() not in
                   stopwords.words("english") and len(token) > 1)
    ]
Example 9
def read_past_winners_file(winners_file):
    winners_data = {}
    stemmer = Stemmer()
    with open(winners_file) as file:
        for line in file:
            query = line.split("@@@")[0]
            text = line.split("@@@")[1]
            if query not in winners_data:
                winners_data[query] = []
            text = " ".join([stemmer.stem(word) for word in clean_text(text).split()])
            winners_data[query].append(text)
    return winners_data
Example 10
def modify_text(text, index, query):
    stemmer = Stemmer()
    query_terms = [stemmer.stem(q) for q in query.split()]
    new_text = ""

    if index == 4:
        new_text = query + text + query
        return new_text

    elif index == 0:
        p = 0.5
    elif index == 2:
        p = 0.2

    tokens = clean_texts(text).split()

    for token in tokens:
        # With probability p, drop a token whose stem matches a query term.
        if stemmer.stem(token) in query_terms:
            if random() < p:
                continue
        new_text += token + " "
    return new_text
Example 11
class PCTokenizer:
    def __init__(self):
        self.stemmer = Stemmer()

    def tokenize_stem(self, text: str) -> List[str]:
        tokens = nltk.tokenize.word_tokenize(text)
        stemmed_tokens = []
        for t in tokens:
            try:
                stemmed_tokens.append(self.stemmer.stem(t))
            except Exception:
                # Skip tokens the stemmer cannot handle.
                pass

        return stemmed_tokens
Example 12
class Tokenizer:
    def __init__(self, vocab_path, unk="<UNK>", pad="<PAD>"):
        self.vocab_path = vocab_path
        self.unk = unk
        self.pad = pad
        self.word2idx = self.load_vocab(vocab_path)
        self.sws = {}
        for w in stopwords.words('english'):
            self.sws[w] = 1
        self.stemmer = Stemmer()

    def load_vocab(self, vocab_path):
        word2idx = {}
        word2idx[self.pad] = 0
        word2idx[self.unk] = 1
        with open(vocab_path) as fin:
            for step, line in enumerate(fin):
                tokens = line.strip().split()
                word2idx[tokens[0]] = step + 2
        return word2idx

    def tok2idx(self, toks, word2idx):
        input_ids = []
        for tok in toks:
            if tok in word2idx:
                input_ids.append(word2idx[tok])
            else:
                input_ids.append(word2idx[self.unk])
        return input_ids

    def tokenize(self, line):
        regex_drop_char = re.compile(r'[^a-z0-9\s]+')
        regex_multi_space = re.compile(r'\s+')
        toks = regex_multi_space.sub(' ', regex_drop_char.sub(
            ' ', line.lower())).strip().split()
        wordsFiltered = []
        for w in toks:
            if w not in self.sws:
                w = self.stemmer.stem(w)
                wordsFiltered.append(w)
        return wordsFiltered

    def convert_tokens_to_ids(self, toks):
        input_ids = []
        for tok in toks:
            if tok in self.word2idx:
                input_ids.append(self.word2idx[tok])
            else:
                input_ids.append(self.word2idx[self.unk])
        return input_ids
Example 13
class CacheStemmer:
    def __init__(self):
        self.stemmer = Stemmer()
        self.stem_dict = dict()

    def stem(self, token):
        if token in self.stem_dict:
            return self.stem_dict[token]
        else:
            r = self.stemmer.stem(token)
            self.stem_dict[token] = r
            return r

    def stem_list(self, tokens):
        return [self.stem(t) for t in tokens]
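
CacheStemmer memoizes results because stemming the same token over and over is wasted work. The same pattern can also be written with functools.lru_cache; the sketch below assumes krovetzstemmer is installed, and cached_stem is an illustrative name, not part of the original code:

from functools import lru_cache
from krovetzstemmer import Stemmer

_stemmer = Stemmer()

@lru_cache(maxsize=None)
def cached_stem(token):
    # Identical tokens are stemmed only once; later calls hit the cache.
    return _stemmer.stem(token)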
Example 14
def data_to_wordsentences(raw_data):
    """ convert a text to list of sentences
    :param raw_data: a text to be converted
    :return: list of sentences
    """
    sentences = []
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(raw_data.text.strip())
    stemmer = Stemmer()
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            words = tokenise_text(raw_sentence)
            for idx, w in enumerate(words):
                words[idx] = stemmer.stem(w.decode("utf-8", "ignore"))
            sentences.append(words)
    return sentences
Example 15
def generate_sentences_list_from_raw_text_list(raw_text_list):
    """ convert list of texts into list of sentences for the traning of Word2Vec
    :param raw_text_list: list of texts to be converted
    :return: list if sentences
    """

    sentences_list = []
    stemmer = Stemmer()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    for i in range(len(raw_text_list)):
        raw_sentences = tokenizer.tokenize(raw_text_list[i])
        for raw_sentence in raw_sentences:
            if len(raw_sentence) > 0:
                words = tokenise_text(raw_sentence)
                for idx, w in enumerate(words):
                    words[idx] = stemmer.stem(w.decode("utf-8", "ignore"))
                sentences_list.append(words)
    return sentences_list
Example 16
import pickle


class StemmerCache:
    def __init__(self, cache=None):
        from krovetzstemmer import Stemmer
        self.stemmer = Stemmer()
        if cache is not None:
            self.cache = cache
        else:
            self.cache = dict()

    def stem(self, t):
        if t in self.cache:
            return self.cache[t]
        else:
            r = self.stemmer.stem(t)
            self.cache[t] = r
            # Persist the cache to disk every 1000 new entries.
            if len(self.cache) % 1000 == 0:
                pickle.dump(self.cache, open("stemmer.pickle", "wb"))
            return r
Example 17
def get_text_centroid(text, model, stemmer=None):
    sum_vector = None
    denom = 0
    if stemmer is not None:
        stem = Stemmer()
    for token in clean_sentence(text):
        if stemmer is not None:
            token = stem.stem(token)
        try:
            vector = model.wv[token]
        except KeyError:
            continue
        if sum_vector is None:
            sum_vector = np.zeros(vector.shape[0])
        sum_vector = sum_vector + vector
        denom += 1
    if sum_vector is None:
        return None
    return sum_vector / denom
Example 18
    def get_stemmed_words_index(self, window_words_index):
        '''
        Get stemmed-words index from window-words index
        :param window_words_index:
        :return:
        '''

        all_words = self.get_all_words(window_words_index)
        stem_words_index = {}

        krovetz = KrovetzStemmer()
        for word in all_words:
            # Stem word using krovetz
            stemmed_word = krovetz.stem(word)

            # Group by stemmed word
            stem_words_index.setdefault(stemmed_word, [])
            stem_words_index[stemmed_word].append(word)

        return stem_words_index
Example 19
    def __init__(self, d: Dict[WordAsID, np.array], skip_stopwords=True, stem=True):
        self.tokenizer = get_tokenizer()

        self.stopwords_as_ids: Set[WordAsID] = set()
        new_d = {}
        if skip_stopwords:
            stopwords = load_stopwords_for_query()
            for key in d.keys():
                tokens = decode_word_as_id(self.tokenizer, key)
                if len(tokens) == 1 and tokens[0] in stopwords:
                    self.stopwords_as_ids.add(key)
                else:
                    new_d[key] = d[key]
            d = new_d

        if stem:
            d_raw = defaultdict(list)
            stemmer = Stemmer()

            for key in d.keys():
                tokens = decode_word_as_id(self.tokenizer, key)
                plain_word = pretty_tokens(tokens, True)
                stemmed = stemmer.stem(plain_word)
                d_raw[stemmed].append(d[key])

            new_d: Dict[str, TokenScore] = {}
            for key, items in d_raw.items():
                score: TokenScore = [average([t[0] for t in items]), average([t[1] for t in items])]
                new_d[key] = score
            d = new_d
            self.stem = True
            self.stemmer = stemmer
            self.log_odd = self.log_odd_w_stem

        self.d = d
        self.smoothing = 0.1
    tok = Tokenizer()


a = A()
text = "She even shows-me her boobs and I like it.\nHello world!"
print(A.tok(text))

print(
    list(token.lower() for token in word_tokenize(
        re.sub(
            "\n", "",
            text.translate(str.maketrans(punctuation, " " *
                                         len(punctuation)))))
         if token.isalnum()))
print(
    list(token.lower() for token in word_tokenize(
        re.sub(
            "\n", "",
            text.translate(str.maketrans(punctuation, " " *
                                         len(punctuation)))))
         if token.isalnum() and token.lower() not in stopwords.words("english")))
stemmer = Stemmer()
print(
    list(
        stemmer.stem(token.lower()) for token in word_tokenize(
            re.sub(
                "\n", "",
                text.translate(
                    str.maketrans(punctuation, " " * len(punctuation)))))
        if token.isalnum() and token.lower() not in stopwords.words("english")))
def get_term_frequency(text, term):
    stemmer = Stemmer()
    return [stemmer.stem(token) for token in text.split()].count(term)
Example 22
import json

from krovetzstemmer import Stemmer as KrovetzStemmer
import unicodecsv as csv
from prettyprint import prettyprint

# Instantiate krovetz stemmer
krovetz = KrovetzStemmer()

# Read result of 1_index
with open('1_2_index.txt', 'rb') as f:
    str_word_files_index = f.read()
    word_files_index = json.loads(str_word_files_index)

    stem_word_index = {}
    for word, files in word_files_index.items():
        # Stem word using krovetz
        stemmed_word = krovetz.stem(word)

        # Group by stemmed word
        stem_word_index.setdefault(stemmed_word, [])
        stem_word_index[stemmed_word].append(word)

    for stemmed_word, words in stem_word_index.items():
        print(u'{}: {}'.format(stemmed_word, ', '.join(words)))

    print('')
    filename = '3_stemmed_words.csv'
    with open(filename, 'wb') as f:
        print('Writing to file {}'.format(filename))

        writer = csv.writer(f)
        for stemmed_word, words in stem_word_index.items():
Example 23
from sklearn.feature_extraction.text import CountVectorizer
from gensim.parsing.preprocessing import (preprocess_string, remove_stopwords,
                                          strip_multiple_whitespaces,
                                          strip_punctuation, strip_short,
                                          strip_tags)
import codecs

EPS = 10e-7

import string
table = str.maketrans('', '', '!"#$%\'()*+,-./:;<=>?@[\\]^_`{|}~')

# The Krovetz stemmer is a less "destructive" stemmer than Porter
# (a short comparison sketch follows at the end of this example).
# Viewing morphology as an inference process: https://dl.acm.org/citation.cfm?id=160718
from krovetzstemmer import Stemmer  # quite a decent stemmer for PR
ks = Stemmer()

CUSTOM_FILTERS = [
    lambda x: x.lower(), strip_tags, strip_multiple_whitespaces,
    strip_punctuation, remove_stopwords, lambda x: ks.stem(x)
]


def custom_tokenizer(s):
    return [
        w.translate(table) for w in preprocess_string(s, [
            strip_tags, lambda x: strip_short(x, 2), remove_stopwords,
            lambda x: ks.stem(x)
        ])
    ]


class Dataset:
    def __init__(self):
        pass
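
As noted in the comment above, the Krovetz stemmer is less aggressive than Porter. A minimal comparison sketch, assuming NLTK's PorterStemmer is available; the exact outputs depend on the library versions:

from krovetzstemmer import Stemmer
from nltk.stem import PorterStemmer

krovetz = Stemmer()
porter = PorterStemmer()

for word in ["studies", "flies", "relational"]:
    # Krovetz aims to return dictionary words (e.g. "study"), whereas
    # Porter may produce truncated stems such as "studi".
    print(word, "->", krovetz.stem(word), "vs", porter.stem(word))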
Example 24
def get_sentence_vector(sentence, model):
    stemmer = Stemmer()
    sentence = clean_text(sentence)
    words = sentence.split()
    stemmed = [stemmer.stem(w) for w in words]
    return get_stemmed_document_vector(stemmed, model)
Example 25
if stemmed:

    stemmer = Stemmer()

    vocab = load_from_pickle_file("preprocessing/pre_data/vocabulary")

    for _, query in tqdm(queries_obj.items()):
        vocab.update(query.title.split())
        vocab.update(query.desc.split())

    mapping_stemmed = {}

    print("Stemming...")

    for word in tqdm(vocab):
        mapping_stemmed[word] = stemmer.stem(word)

    for _, doc in tqdm(corpus_obj.docs.items()):
        doc.headline = " ".join(
            [mapping_stemmed[word] for word in doc.headline.split()])
        doc.content = " ".join(
            [mapping_stemmed[word] for word in doc.content.split()])

    for _, query in tqdm(queries_obj.items()):
        query.title = " ".join(
            [mapping_stemmed[word] for word in query.title.split()])
        query.desc = " ".join(
            [mapping_stemmed[word] for word in query.desc.split()])

    corpus_sent = [
        list(map(lambda w: mapping_stemmed[w], sent))
Example 26
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
from krovetzstemmer import Stemmer
stemmer = Stemmer()

s = "According to Wikipedia, Information Retrieval is the activity of obtaining information resources relevant to an information need from a collection of information resources."

tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(s)
print(tokens)

words = [w.lower() for w in tokens]
print(words)

non_stopped_words = [w for w in words if w not in stopwords]
print(non_stopped_words)

stemmed_words = [stemmer.stem(w) for w in non_stopped_words]
print(stemmed_words)
Example 27
from Porter import PorterStemmer
from krovetzstemmer import Stemmer

from common import readTextFromFile
from common import getTextFromHTML

krov = Stemmer()

f = 'en/articles/d/o/r/Dorothy_Block_a8f8.html'
text = getTextFromHTML(readTextFromFile(f))

print('ori:\n', text, '\n')
print('porter:\n', PorterStemmer.useStemer(text), '\n')
print('krov:\n', krov.stem(text), '\n')
Example 28
class KrovetzStemmer(Stemming):
    def __init__(self):
        self.stemmer = Stemmer()

    def stem(self, text: Text) -> Text:
        return self.stemmer.stem(text)