def get_stemmer(name):
    """Build a fresh stemmer for the requested algorithm.

    ``'porter'`` gives a ``porterstemmer.Stemmer`` instance and ``'krovetz'``
    gives a ``krovetzstemmer.Stemmer`` instance.  Any other value yields
    ``None``.
    """
    if name == 'porter':
        return porterstemmer.Stemmer()
    if name == 'krovetz':
        return krovetzstemmer.Stemmer()
    # Unrecognised name: signal "no stemmer" to the caller.
    return None
Beispiel #2
0
 def __init__(self, args, nlp=None, idf=None):
     """Store the caller-supplied collaborators and set up defaults.

     ``args``, ``nlp`` and ``idf`` are kept exactly as passed in.  Paths come
     from the module-level ``PATHS`` mapping and the tokenizer is created by
     calling the factory stored under ``DEFAULTS['tokenizer']``.
     """
     # Values handed in by the caller.
     self.args = args
     self.nlp = nlp
     self.idf = idf

     # Locations and helpers resolved from module-level configuration.
     self.index_path = PATHS['galago_idx']
     self.db_path = PATHS['concepts_db']
     self.tokenizer = DEFAULTS['tokenizer']()
     self.stemmer = krovetzstemmer.Stemmer()

     # Fixed settings.
     self.ngrams = 2
     self.use_stemmer_threshold = 3  # idf ratio of stemmed/non-stemmed token

     # No connection yet; starts out as None.
     self.cnx = None
def create_tdfidf_dicts_per_doc_for_file(doc_filepath, is_init=False):
    """Annotate every document of a file with term statistics and TF-IDF.

    Documents are loaded with ``read_current_doc_file(doc_filepath, is_init)``
    and addressed as ``docs[query][user]``.  Each document gains these keys:

    * ``'StemList'`` - distinct stems in order of first appearance
    * ``'TfDict'``   - stem -> number of occurrences in the document
    * ``'TFList'``   - term frequencies aligned with ``'StemList'``
    * ``'DFList'``   - document frequencies aligned with ``'StemList'``
    * ``'TfIdf'``    - result of ``calc_tfidf_dict`` for the three lists

    The same nested mapping, now annotated, is returned.
    """
    docs = read_current_doc_file(doc_filepath, is_init)
    doc_freq = {}
    stemmer = krovetzstemmer.Stemmer()

    # Pass 1: per-document term counts plus the file-wide document frequency.
    for query in docs:
        for user in docs[query]:
            doc = docs[query][user]
            # Anything that is not an ASCII letter, digit or space becomes a
            # space, so splitting on a single space yields the raw tokens.
            cleaned = re.sub('[^a-zA-Z0-9 ]', ' ', doc['FullText'])
            term_freq = {}
            stems = []
            doc['TfDict'] = term_freq
            doc['StemList'] = stems
            for token in cleaned.split(" "):
                stem = stemmer.stem(token)
                if stem in ('', '\n'):
                    continue
                if stem in term_freq:
                    term_freq[stem] += 1
                else:
                    stems.append(stem)
                    term_freq[stem] = 1
            # Each distinct stem counts once per document.
            for stem in stems:
                doc_freq[stem] = doc_freq.get(stem, 0) + 1

    # Pass 2: document frequencies are complete now, so the aligned lists and
    # the TF-IDF result can be produced for every document.
    for query in docs:
        for user in docs[query]:
            doc = docs[query][user]
            stems = doc['StemList']
            doc['DFList'] = [doc_freq[stem] for stem in stems]
            doc['TFList'] = [doc['TfDict'][stem] for stem in stems]
            doc['TfIdf'] = calc_tfidf_dict(
                stem_list=stems,
                tf_list=doc['TFList'],
                df_list=doc['DFList'])

    return docs
Beispiel #4
0
 def __init__(self, printer):
     """Set up the text-cleaning helpers used by this object.

     ``printer`` is stored unchanged.  Two compiled regular expressions, a
     Krovetz stemmer and a list of lowercase English stop words are created.
     """
     self.printer = printer
     # Raw strings keep ``\s`` as a regex escape; in a normal string literal
     # it is an invalid string escape that newer Python versions warn about.
     # Matches runs of characters other than a-z, 0-9 and whitespace.
     self.regex_drop_char = re.compile(r'[^a-z0-9\s]+')
     # Matches runs of one or more whitespace characters.
     self.regex_multi_space = re.compile(r'\s+')
     self.stemmer = krovetzstemmer.Stemmer()
     # NOTE: 'would' appears twice in this list; the duplicate is kept so the
     # list contents stay exactly as before.
     self.stop_words = [
         'a', 'able', 'about', 'above', 'according', 'accordingly',
         'across', 'actually', 'after', 'afterwards', 'again', 'against',
         'ain', 'all', 'allow', 'allows', 'almost', 'alone', 'along',
         'already', 'also', 'although', 'always', 'am', 'among', 'amongst',
         'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone',
         'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear',
         'appreciate', 'appropriate', 'are', 'aren', 'around', 'as',
         'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away',
         'awfully', 'b', 'be', 'became', 'because', 'become', 'becomes',
         'becoming', 'been', 'before', 'beforehand', 'behind', 'being',
         'believe', 'below', 'beside', 'besides', 'best', 'better',
         'between', 'beyond', 'both', 'brief', 'but', 'by', 'c', 'came',
         'can', 'cannot', 'cant', 'cause', 'causes', 'certain', 'certainly',
         'changes', 'clearly', 'co', 'com', 'come', 'comes', 'concerning',
         'consequently', 'consider', 'considering', 'contain', 'containing',
         'contains', 'corresponding', 'could', 'couldn', 'course',
         'currently', 'd', 'definitely', 'described', 'despite', 'did',
         'didn', 'different', 'do', 'does', 'doesn', 'doing', 'don', 'done',
         'down', 'downwards', 'during', 'e', 'each', 'edu', 'eg', 'eight',
         'either', 'else', 'elsewhere', 'enough', 'entirely', 'especially',
         'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone',
         'everything', 'everywhere', 'ex', 'exactly', 'example', 'except',
         'f', 'far', 'few', 'fifth', 'first', 'five', 'followed',
         'following', 'follows', 'for', 'former', 'formerly', 'forth',
         'four', 'from', 'further', 'furthermore', 'g', 'get', 'gets',
         'getting', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got',
         'gotten', 'greetings', 'h', 'had', 'hadn', 'happens', 'hardly',
         'has', 'hasn', 'have', 'haven', 'having', 'he', 'hello', 'help',
         'hence', 'her', 'here', 'hereafter', 'hereby', 'herein',
         'hereupon', 'hers', 'herself', 'hi', 'him', 'himself', 'his',
         'hither', 'hopefully', 'how', 'howbeit', 'however', 'i', 'ie',
         'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed',
         'indicate', 'indicated', 'indicates', 'inner', 'insofar',
         'instead', 'into', 'inward', 'is', 'isn', 'it', 'its', 'itself',
         'j', 'just', 'k', 'keep', 'keeps', 'kept', 'know', 'knows',
         'known', 'l', 'last', 'lately', 'later', 'latter', 'latterly',
         'least', 'less', 'lest', 'let', 'like', 'liked', 'likely',
         'little', 'll', 'look', 'looking', 'looks', 'ltd', 'm', 'mainly',
         'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely',
         'might', 'more', 'moreover', 'most', 'mostly', 'much', 'must',
         'my', 'myself', 'n', 'name', 'namely', 'nd', 'near', 'nearly',
         'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless',
         'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone',
         'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere',
         'o', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old',
         'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other',
         'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out',
         'outside', 'over', 'overall', 'own', 'p', 'particular',
         'particularly', 'per', 'perhaps', 'placed', 'please', 'plus',
         'possible', 'presumably', 'probably', 'provides', 'q', 'que',
         'quite', 'qv', 'r', 'rather', 'rd', 're', 'really', 'reasonably',
         'regarding', 'regardless', 'regards', 'relatively', 'respectively',
         'right', 's', 'said', 'same', 'saw', 'say', 'saying', 'says',
         'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming',
         'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious',
         'seriously', 'seven', 'several', 'shall', 'she', 'should',
         'shouldn', 'since', 'six', 'so', 'some', 'somebody', 'somehow',
         'someone', 'something', 'sometime', 'sometimes', 'somewhat',
         'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying',
         'still', 'sub', 'such', 'sup', 'sure', 't', 'take', 'taken',
         'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that',
         'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then',
         'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein',
         'theres', 'thereupon', 'these', 'they', 'think', 'third', 'this',
         'thorough', 'thoroughly', 'those', 'though', 'three', 'through',
         'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took',
         'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying',
         'twice', 'two', 'u', 'un', 'under', 'unfortunately', 'unless',
         'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used',
         'useful', 'uses', 'using', 'usually', 'uucp', 'v', 've', 'value',
         'various', 'very', 'via', 'viz', 'vs', 'w', 'want', 'wants', 'was',
         'wasn', 'way', 'we', 'welcome', 'well', 'went', 'were', 'weren',
         'what', 'whatever', 'when', 'whence', 'whenever', 'where',
         'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
         'wherever', 'whether', 'which', 'while', 'whither', 'who',
         'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'willing',
         'wish', 'with', 'within', 'without', 'won', 'wonder', 'would',
         'would', 'wouldn', 'x', 'y', 'yes', 'yet', 'you', 'youve', 'your',
         'youre', 'yours', 'yourself', 'yourselves', 'z', 'zero'
     ]
Beispiel #5
0
import io
import sys

import html2text as html2text
import krovetzstemmer
from nltk import PorterStemmer

# Instantiate both stemmers once at module level: NLTK's Porter stemmer and
# the Krovetz stemmer.
porter = PorterStemmer()
krovetz = krovetzstemmer.Stemmer()

# Fewer than five file arguments: show the expected invocation.  The script
# does not exit here; it carries on with whatever arguments were given.
if len(sys.argv) - 1 < 5:
    print('Usage :')
    print('python 4_6.py <file_1> ... <file_5>')

# Every command-line argument after the script name is taken to be a file.
files = sys.argv[1:]

# Get contents of each file
results = {}
for idx, file in enumerate(files):
    print('{} of {}. Processing {}'.format(idx + 1, len(files), file))
    print('=' * 30)

    # get text content
    h = html2text.HTML2Text()
    h.ignore_links = True
    text = h.handle(u' '.join([
        line.strip()
 def __init__(self, stemmer=None, stoplist=None):
     """Remember the stemmer and stop list, filling in defaults.

     A falsy ``stemmer`` (including ``None``) is replaced by a new
     ``krovetzstemmer.Stemmer``; a falsy ``stoplist`` (including ``None`` or
     an empty collection) is replaced by a new empty ``set``.
     """
     self.stemmer = stemmer or krovetzstemmer.Stemmer()
     self.stoplist = stoplist or set()
import nltk
import krovetzstemmer
import unicodedata
import posixpath

#print(posixpath.basename("s3://csci-e29/project/asldk/abc.txt"))
"""
a = "  \n \n a \n \n b"
print(nltk.word_tokenize(a))
"""
s = 'Kästner'


def ud():
    """Return the current module-level ``s`` as accent-free ASCII bytes.

    ``s`` is NFKD-normalised and then encoded to ASCII with ``'ignore'``,
    which drops every character that has no ASCII representation.
    """
    decomposed = unicodedata.normalize('NFKD', s)
    return decomposed.encode('ASCII', 'ignore')


# Show the accent-stripped bytes, then continue with their text form.
print(ud())
s = ud().decode("utf-8")

# ``word`` is bound up front so the name always exists, even when the token
# is rejected by the filter below.
word = None
# Stem only purely alphanumeric tokens that are not English stop words, and
# print the stem only when one was actually produced.
if s.isalnum() and s.lower() not in nltk.corpus.stopwords.words("english"):
    word = krovetzstemmer.Stemmer().stem(s)
    print(word)
Beispiel #8
0
#!/usr/bin/env python
'''
| Filename    : util_cleanup_tokens.py
| Description : Receive tokens on stdin and produce a list of clean tokens on stdout
| Author      : Pushpendre Rastogi
| Created     : Sun Dec 11 18:21:13 2016 (-0500)
| Last-Updated: Sun Dec 11 19:41:37 2016 (-0500)
|           By: Pushpendre Rastogi
|     Update #: 10
'''
import re
import krovetzstemmer
from string import maketrans
import sys
# Bound ``stem`` method of a single Krovetz stemmer instance.
stemmer = krovetzstemmer.Stemmer().stem
# Characters with code points 33-47, 58-64, 91-96 and 123-126, i.e. the
# ASCII punctuation characters.  NOTE: ``range(...) + range(...)`` relies on
# ``range`` returning lists, which is Python 2 behaviour.
PUNCT_CHAR = frozenset(''.join(chr(e) for e in range(
    33, 48) + range(58, 65) + range(91, 97) + range(123, 127)))
# Characters that get a backslash prefix when placed in the regex below.
REGEX_SPECIAL_CHAR = frozenset(r'[]().-|^{}*+$\?')
# With ``keep`` False the pattern is a plain character class; with True it
# would be wrapped in a capturing group, which makes ``re.split`` return the
# matched punctuation as well.
keep = False
keep_or_remove_punct = ('([%s])' if keep else '[%s]')
# Character class matching any single character from PUNCT_CHAR.
PUNCT_MATCH_REGEX = re.compile(
    keep_or_remove_punct%(''.join(
        ('\\%s'%e if e in REGEX_SPECIAL_CHAR else e)
        for e in PUNCT_CHAR)))
# Translation table that maps every decimal digit to "0".
num2zero_table = maketrans("0123456789", "0000000000")
for row in sys.stdin:
    row = row.strip()
    if row != '':
        print row,
        for e in re.split(PUNCT_MATCH_REGEX, row.lower().translate(None, ".")):
            if e != '':