class StemProvider(Provider):
    """Apply Snowball (Porter-family) stemming to input values.

    Accepts either a single word or a list of words.
    """

    def __init__(self, language='english', **kwargs):
        """
        See here for a full list of languages:

            http://nltk.org/_modules/nltk/stem/snowball.html

        .. note:: This does not depend on nltk, it depends on the
           ``pystemmer`` package.

        :param language: language to use during stemming, defaults to english.
        """
        Provider.__init__(self, **kwargs)
        self._stemmer = Stemmer(language)

    def do_process(self, input_value):
        # A plain string is stemmed as one word; anything else is treated
        # as a sequence of words.
        if not isinstance(input_value, str):
            return self._stemmer.stemWords(input_value)
        return self._stemmer.stemWord(input_value)
def classif(text, mass, num_all_docs, num_words_unic):
    # Python 2 code (print statements): naive-Bayes-style classification of a
    # Russian document `text` against the trained categories in `mass`.
    stm = Stemmer('russian')
    # Tokenize, lowercase and stem the input document.
    text = stm.stemWords(regexp_tokenize((text.decode('UTF-8')).lower(), r"(?x) \w+ | \w+(-\w+)*"))
    num_povt_words = 0
    summa = 0
    while_iter = 0
    while while_iter < len(mass):
        # Log prior of the category (log base 1.1 matches the likelihood term).
        summand_1 = log((mass[while_iter].num_docs + 0.0) / (num_all_docs + 0.0) + 0.0, 1.1)
        for i in text:
            # Count occurrences of token `i` in the category's word list.
            for i1 in mass[while_iter].lst_allword:
                if i == i1:
                    num_povt_words = num_povt_words + 1
            # Laplace-smoothed log likelihood of token `i` for this category.
            summand_2 = log(((num_povt_words + 1) + 0.0) / ((num_words_unic + mass[while_iter].num_words) + 0.0), 1.1)
            num_povt_words = 0
            summa = summa + summand_2
        mass[while_iter].c = summand_1 + summa
        summa = 0
        while_iter = while_iter + 1
    # Pick the category with the highest score and print its name.
    max_c = -100000
    while_iter = 0
    number_max = 0
    while while_iter < len(mass):
        print mass[while_iter].c
        if mass[while_iter].c > max_c:
            max_c = mass[while_iter].c
            number_max = while_iter
        while_iter = while_iter + 1
    print mass[number_max].name_categories
def run():
    # Python 2 code (print statements, iteritems): builds an English word
    # frequency table from the `db.en` collection into `db.en_words_freq`.
    stemmer = Stemmer("english")
    pages = db.en.find()
    print colored.yellow("statistic words")
    wordstatistic = {}
    for page in progress.bar(pages, size=db.en.count()):
        data = page.get("data")
        if not data: continue
        content = data.get("content")
        if not content:
            # Pages without content are dropped from the collection.
            db.en.remove({"_id": page["_id"]})
            continue
        words = EN_WORD_CUT.split(content)
        for word in words:
            w = stemmer.stemWord(word.strip()).lower()
            # Keep stems that are non-empty, short enough, and not ignored.
            if w and len(w) < 20 and not w in EN_IGNORE:
                if wordstatistic.get(w):
                    wordstatistic[w] += 1
                else:
                    wordstatistic[w] = 1
    print colored.yellow("save to en_words_freq")
    savequene = []
    # Insert in batches of 1000 documents.
    for k, v in progress.bar(wordstatistic.iteritems(), size=len(wordstatistic)):
        savequene.append({"_id": k, "freq": v})
        if len(savequene) >= 1000:
            db.en_words_freq.insert(savequene)
            savequene = []
    if savequene: db.en_words_freq.insert(savequene)
    print colored.cyan(
        "count of en_words_freq: %d" % db.en_words_freq.count())
def getStems(cleanedText, stopWords):
    """Tokenize `cleanedText`, stem each surviving token, and record each
    stem's id (module-level `termDict`) and 1-based token positions
    (module-level `terms`). Returns nothing; all results go into the
    module-level dicts and the local `stems` map."""
    stems = {}
    matches = re.finditer(r'\w+(\.?\w+)*', cleanedText.strip(), flags=re.IGNORECASE)
    stemmer = Stemmer('english')
    #maxlength = sum(1 for _ in matches1)
    #stemmer.maxCacheSize = maxlength
    # New token ids continue after whatever termDict already holds.
    offset = len(termDict)
    tokenid = offset + 1
    position = 0
    for match in matches:
        #position = match.start()
        position += 1  # 1-based token position, not a character offset
        token = match.group()
        filteredToken = filterToken(token, stopWords)
        if filteredToken and filteredToken is not None:
            wordStem = stemmer.stemWord(filteredToken.lower())
            #present = wordStem in stems
            if wordStem not in stems:
                #tokenid += 1
                stems[wordStem] = tokenid
                positions = set()
                positions.add(position)
                if wordStem not in termDict:
                    termDict[wordStem] = tokenid
                    terms[tokenid] = positions
                    tokenid = tokenid + 1
                else:
                    stemid = termDict[wordStem]
                    # NOTE(review): this replaces any previously recorded
                    # positions for a known term with a fresh set -- confirm
                    # whether accumulation across documents was intended.
                    terms[stemid] = positions
            else:
                stemid = termDict[wordStem]
                postns = terms[stemid]
                postns.add(position)
                terms[stemid] = postns
def _prepare_text(self, text):
    """Tokenize, stop-word-filter, and stem the given text.

    Returns the list of stemmed words.
    """
    tokens = re.findall('[a-z0-9\']+', text.lower())
    kept = [token for token in tokens if token not in STOP_WORDS]
    return Stemmer('english').stemWords(kept)
def train(name_file_dbase, way_to_dbase):
    # Python 2 training routine: reads the database description file, counts
    # documents per category, concatenates and stems each category's texts,
    # and returns (categories, total doc count, summed unique word count).
    stm = Stemmer('russian')
    file_base = open(name_file_dbase, 'r')
    Lines = file_base.readlines()
    num_all_docs = len(Lines) + 1
    mass = []
    iter1 = 0
    iter2 = 0
    for line in Lines:
        number1, address1 = unpack_line(line)
        number = number1.strip("\n")
        address = address1.strip("\n")
        if (number == "1"):
            # A "1" marks the first document of a new category.
            mass.append(Categories())
            mass[iter1].name_categories = address1
            mass[iter1 - 1].num_docs = iter2
            iter1 = iter1 + 1
            iter2 = 0
        iter2 = iter2 + 1
    mass[len(mass) - 1].num_docs = iter2
    while_iter = 0
    file_base.close()
    number = 1
    # Concatenate the raw text of every document in each category.
    while while_iter < len(mass):
        while number <= mass[while_iter].num_docs:
            file_forclass = open(way_to_dbase + mass[while_iter].name_categories + '/' + str(number) + 'forclass.txt', 'r')
            str_read = re.sub("^\s+|\n|\r|\s+$", ' ', file_forclass.read())
            mass[while_iter].line_allword = mass[while_iter].line_allword + str_read
            file_forclass.close()
            number = number + 1
        while_iter = while_iter + 1
        number = 1
    while_iter = 0
    # Tokenize + stem each category's text and record word counts.
    while while_iter < len(mass):
        forstemmer = mass[while_iter].line_allword.decode('UTF-8')
        str_read = stm.stemWords(regexp_tokenize(forstemmer.lower(), r"(?x) \w+ | \w+(-\w+)*"))
        mass[while_iter].num_words = len(str_read)
        mass[while_iter].lst_allword = str_read
        lst_unic_words = list(set(mass[while_iter].lst_allword))
        mass[while_iter].num_wordsunic = len(lst_unic_words)
        while_iter = while_iter + 1
    all_words = 0
    num_words_unic = 0
    while_iter = 0
    # Aggregate counts across all categories.
    while while_iter < len(mass):
        all_words = all_words + mass[while_iter].num_words
        num_words_unic = num_words_unic + mass[while_iter].num_wordsunic
        while_iter = while_iter + 1
    return mass, num_all_docs, num_words_unic
def get_search_phrases(self, indexing_func=None):
    """Returns search phrases from properties in a given Model instance.

    Args (optional):
      only_index: List of strings. Restricts indexing to these property names.
      indexing_func: A function that returns a set of keywords or phrases.

    Note that the indexing_func can be passed in to allow more customized
    search phrase generation.

    Two model variables influence the output of this method:
      INDEX_ONLY: If None, all indexable properties are indexed. If a list
        of property names, only those properties are indexed.
      INDEX_MULTI_WORD: Class variable that allows multi-word search phrases
        like "statue of liberty."
      INDEX_STEMMING: Returns stemmed phrases.
    """
    # Python 2 code (iteritems, basestring, "except TypeError, e").
    if not indexing_func:
        klass = self.__class__
        if klass.INDEX_MULTI_WORD:
            indexing_func = klass.get_search_phraseset
        else:
            indexing_func = klass.get_simple_search_phraseset
    if self.INDEX_STEMMING:
        stemmer = Stemmer('english')
    phrases = set()
    # allow indexing of 'subentities' such as tasks of a list as well
    queries = [(self, self.INDEX_ONLY)] + self.INDEX_SUBENTITY_QUERIES
    import logging
    for query, props in queries:
        entities = []
        try:
            subentities = query(self).fetch(1000)  # get all of them
            # Page through results 1000 at a time, keyed on __key__.
            while len(subentities) > 0:
                entities.extend(subentities)
                last_key = subentities[-1].key()
                subentities = query(self).order('__key__').filter('__key__ >', last_key).fetch(1000)
        except TypeError, e:
            # query is not callable because it's an actual entity
            entities = [query]
    for entity in entities:
        for prop_name, prop_value in entity.properties().iteritems():
            if not props or prop_name in props:
                values = prop_value.get_value_for_datastore(entity)
                if not isinstance(values, list):
                    values = [values]
                # Only index string properties that are not binary blobs.
                if (isinstance(values[0], basestring) and
                        not isinstance(values[0], datastore_types.Blob)):
                    for value in values:
                        words = indexing_func(value, add_stop_words=self.INDEX_ADD_STOP_WORDS)
                        if self.INDEX_STEMMING:
                            stemmed_words = set(stemmer.stemWords(words))
                            phrases.update(stemmed_words)
                        else:
                            phrases.update(words)
    # NOTE(review): no return statement is visible in this chunk; `phrases`
    # appears to be built and dropped. Also unclear whether the entity loop
    # belongs inside the `for query, props` loop -- the collapsed source is
    # ambiguous; confirm against the original file.
def make_index(expression):
    """Normalize and stem every token of *expression*, returning the sorted
    result as a tuple so that equivalent expressions compare equal.

    ``expression`` must be a list or tuple of tokens.
    """
    stemmer = Stemmer("french")
    stemmed = sorted(stemmer.stemWord(normalize_token(token)) for token in expression)
    return tuple(stemmed)
def processQueries(queries):
    """Filter and stem each raw query string.

    :param queries: iterable of raw query strings.
    :return: list of lowercased, stop-word-filtered, stemmed queries.
    """
    # Hoisted out of the loop: the original built a new Stemmer (and fetched
    # the stop-word list) once per query, which was pure overhead.
    # Assumes tokenize.getStopWords() is pure -- TODO confirm.
    stemmer = Stemmer('english')
    stopWords = tokenize.getStopWords()
    queryList = []
    for query in queries:
        filteredQuery = tokenize.filterToken(query, stopWords)
        # `x and x is not None` was redundant; truthiness covers both cases.
        if filteredQuery:
            queryList.append(stemmer.stemWord(filteredQuery.lower()))
    return queryList
def parse_html(html):
    """Extract stemmed Danish words from an HTML document.

    Tokens that are stop words, shorter than two characters, or contain a
    backslash are discarded before stemming.
    """
    stem = Stemmer("danish").stemWord
    return [
        stem(token)
        for token in (raw.lower() for raw in dehtml(html).split())
        if token not in stop_words and len(token) >= 2 and not token.count('\\')
    ]
def getTerm(term):
    """Look up the id of *term* (after stemming) in the term-ids file.

    :param term: raw term string.
    :return: dict mapping the stemmed term to its integer id; empty dict
        if the stem is not present in the file.
    """
    term_ids = {}
    # Stem once up front; the original rebuilt the stemmer and re-stemmed
    # the term for every line of the file.
    stemmer = Stemmer('english')
    termStem = stemmer.stemWord(term.lower())
    # `with` guarantees the file is closed even on the early return; the
    # original leaked the handle whenever a match was found. Mode 'r' replaces
    # 'rU' (universal newlines is the default; 'U' was removed in Python 3.11).
    with open(TERMIDSFILE, 'r') as term_ids_file:
        for line in term_ids_file:
            pieces = line.strip().split('\t')
            if termStem == pieces[1]:
                term_ids[pieces[1]] = int(pieces[0])
                return term_ids
    return term_ids
class BagOfWordsFeatureBooleanizer(FeatureBooleanizer):
    """Booleanize a free-text feature into per-word presence flags.

    The vocabulary is every preprocessed (transliterated, stripped, stemmed)
    word seen in the training column, minus a stop list.
    """

    def __init__(self, featureName, featuresData, featureId):
        FeatureBooleanizer.__init__(self, featureName, featuresData, featureId)
        self.stemmer = Stemmer('english')
        self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
        stopListFn = './resources/general/stopword.csv'
        # `with` closes the stop-list file; the original leaked the handle.
        with open(stopListFn, 'rt') as stopListFile:
            self.stopList = frozenset(
                filter(None, (self.preprocess(l) for l in stopListFile)))
        allWords = set()
        # Taglines are comma separated; everything else splits on whitespace
        # (str.split(None) == str.split()).
        sep = ',' if self.featureName == 'Basic: Tagline' else None
        for row in featuresData:
            allWords |= set(self.preprocess(w)
                            for w in filter(None, row[featureId].split(sep)))
        self.words = sorted(filter(None, allWords - self.stopList))

    def preprocess(self, s):
        """Lowercase, transliterate to ASCII, drop non-alphanumerics, stem."""
        word = ''.join(c for c in unidecode(s.strip().lower()) if c in self.goodChars)
        return self.stemmer.stemWord(word)

    def getFeatureNames(self):
        """One boolean feature name per vocabulary word."""
        return [self.featureName + ': ' + word for word in self.words]

    def process(self, v):
        """Return one bool per vocabulary word: is it present in *v*?"""
        vWords = set(self.preprocess(w) for w in filter(None, v.split(',')))
        return [(word in vWords) for word in self.words]
class BagOfWordsFeatureSupport(FeatureSupport):
    """Jaccard-similarity support over bag-of-words representations."""

    def __init__(self, featuresData, featureId):
        FeatureSupport.__init__(self, featuresData, featureId)
        self.stemmer = Stemmer('english')
        self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
        stopListFn = './resources/general/stopword.csv'
        # `with` closes the stop-list file; the original leaked the handle.
        with open(stopListFn, 'rt') as stopListFile:
            self.stopList = frozenset(
                filter(None, (self.preprocess(l) for l in stopListFile)))

    def preprocess(self, s):
        """Lowercase, transliterate to ASCII, drop non-alphanumerics, stem."""
        word = ''.join(c for c in unidecode(s.strip().lower()) if c in self.goodChars)
        return self.stemmer.stemWord(word)

    def extract(self, i):
        """Bag of stemmed non-stop words for row *i*.

        An empty bag is replaced by a random 20-letter sentinel token so that
        two empty bags never spuriously compare as identical.
        """
        bag = frozenset(self.preprocess(w) for w in filter(None, self[i].split()))
        ret = bag - self.stopList
        if not ret:
            sentinel = ''.join(random.choice('abcdefghjiklmnopqrstuvwxyz') for _ in range(20))
            ret = frozenset([sentinel])
        return ret

    def similarity(self, a, b):
        """Jaccard index of two bags; defined as 1.0 when both are empty."""
        den = len(a | b)
        # NOTE(review): under Python 2 this would be integer division; the
        # surrounding code appears Python 3 where / is true division -- confirm.
        return len(a & b) / den if den != 0 else 1.0
def stem_words(self, words: List[str]) -> List[str]:
    """Stem list of words with PyStemmer.

    :param words: list of words to stem (may arrive bytes-encoded).
    :return: list of lowercased stems, one per input word.
    :raises McLanguageException: if the language code or word list is None,
        or if the stemmer cannot be initialized.
    """
    language_code = self.language_code()

    # Validate inputs *before* touching them. The original iterated over
    # `words` first, so a None argument crashed with a bare TypeError
    # instead of raising the intended McLanguageException.
    if language_code is None:
        raise McLanguageException("Language code is None.")
    if words is None:
        raise McLanguageException("Words to stem is None.")

    words = decode_object_from_bytes_if_needed(words)

    # Normalize apostrophe so that "it’s" and "it's" get treated identically
    # (it's being done in _tokenize_with_spaces() too but let's not assume that
    # all tokens that are to be stemmed go through sentence tokenization first)
    words = [word.replace("’", "'") for word in words]

    # (Re-)initialize stemmer if needed
    if self.__pystemmer is None:
        try:
            self.__pystemmer = PyStemmer(language_code)
        except Exception as ex:
            raise McLanguageException(
                "Unable to initialize PyStemmer for language '%s': %s" % (language_code, str(ex),)
            )

    stems = self.__pystemmer.stemWords(words)

    if len(words) != len(stems):
        log.warning("Stem count is not the same as word count; words: %s; stems: %s" % (str(words), str(stems),))

    # Perl's Snowball implementation used to return lowercase stems
    stems = [stem.lower() for stem in stems]

    return stems
class Stemmer(object):
    """Thin wrapper exposing a ``stem`` method over PyStemmer's Porter stemmer."""

    def __init__(self):
        # type: () -> None
        self.stemmer = PyStemmer('porter')

    def stem(self, word):
        # type: (unicode) -> unicode
        """Return the Porter stem of *word*."""
        return self.stemmer.stemWord(word)
def __init__(self, language):
    """
    Initializes attributes with the language provided.

    Args:
        language (str): The language used to stem ('french', 'english').
    """
    # PyStemmer snowball stemmer plus NLTK's stop-word list for `language`.
    self.stemmer = Stemmer(language)
    self.stopwords = stopwords.words(language)
def __init__(self, featureName, featuresData, featureId):
    """Build the stemmed vocabulary for one free-text feature column.

    :param featureName: display name of the feature.
    :param featuresData: rows of raw feature data.
    :param featureId: column index of this feature within a row.
    """
    FeatureBooleanizer.__init__(self, featureName, featuresData, featureId)
    self.stemmer = Stemmer('english')
    self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
    stopListFn = './resources/general/stopword.csv'
    # `with` closes the stop-list file; the original leaked the handle.
    with open(stopListFn, 'rt') as stopListFile:
        self.stopList = frozenset(
            filter(None, (self.preprocess(l) for l in stopListFile)))
    allWords = set()
    # Taglines are comma separated; everything else splits on whitespace
    # (str.split(None) == str.split()).
    sep = ',' if self.featureName == 'Basic: Tagline' else None
    for row in featuresData:
        allWords |= set(self.preprocess(w)
                        for w in filter(None, row[featureId].split(sep)))
    self.words = sorted(filter(None, allWords - self.stopList))
def index(text, accepted_languages=None, langs=None):
    """Stem and index the words of *text*.

    :param text: text to index.
    :param accepted_languages: languages the application accepts; defaults to
        the registry setting ``accepted_languages``.
    :param langs: explicit languages to index in; defaults to the guessed
        language of *text* (or all accepted languages if the guess is not
        accepted).
    :return: set of stemmed words, stemmed once per selected language.
    """
    registry = get_current_registry()
    # Fix: compare to None with `is`, not `==` (PEP 8; `==` can invoke
    # arbitrary __eq__ implementations).
    if accepted_languages is None:
        accepted_languages = [x.strip() for x in registry.settings["accepted_languages"].split(",")]
    if langs is None:
        lang = guessLanguage(text)
        # Fall back to all accepted languages if the guess is not accepted.
        langs = [lang] if lang in accepted_languages else accepted_languages
    # Restrict to accepted languages; if nothing survives, index in all.
    langs = list(set(langs).intersection(accepted_languages))
    if not langs:
        langs = accepted_languages
    indexed_words = set()
    for lang in langs:
        stemmer = Stemmer(lang)
        indexed_words.update(stemmer.stemWord(x.value) for x in tokenize(text))
    return indexed_words
def __init__(self, language='english', **kwargs):
    """
    See here for a full list of languages:

        http://nltk.org/_modules/nltk/stem/snowball.html

    .. note:: This does not depend on nltk, it depends on the
       ``pystemmer`` package.

    :param language: language to use during stemming, defaults to english.
    """
    Provider.__init__(self, **kwargs)
    # PyStemmer snowball stemmer for the configured language.
    self._stemmer = Stemmer(language)
class TextEater(object):
    # Python 2 code (print statements): coroutine pipeline for tokenizing,
    # stemming and n-gram counting.
    def __init__(self):
        self.stoplist = gen_stops()
        self.stemmer = Stemmer('english')

    @coroutine
    def sent_filter(self, target):
        # Lowercase each received sentence and forward its whitespace tokens.
        word = ''
        print "ready to eat lines"
        while True:
            sentence = (yield)
            target.send((sentence.lower()).split())

    @coroutine
    def word_filter(self, target):
        # Stem and forward a subset of received tokens.
        print "ready to eat words"
        while True:
            raw = (yield)
            # NOTE(review): this KEEPS words that are short (len <= 3) or in
            # the stop list and drops everything else, which looks inverted
            # for a stop-word filter -- confirm intent.
            target.send([self.stemmer.stemWord(w) for w in raw if len(w) <= 3 or w in self.stoplist])

    @coroutine
    def ngrams(self, container, n=2,):
        "Compute n-grams"
        while True:
            grams = (yield)
            # Slide a window of size n and count each gram in `container`.
            for i in range(0, len((grams)) - (n - 1)):
                container[(tuple(grams[i:i + n]))] += 1

    @coroutine
    def printer(self):
        # Sink: print whatever is sent in.
        while True:
            line = (yield)
            print (line)

    @coroutine
    def typer(self, target):
        # Yields the type of the previously received item.
        print "ready to check type"
        word = None
        while True:
            line = (yield word)
            word = type(line)
def __init__(self, query, db, doc_level_search=True, stemmer=False, path='/var/lib/philologic/databases/'):
    # Python 2 code (print >> sys.stderr): set up a PhiloLogic search over
    # database `db`, optionally stemming the query words with PyStemmer.
    self.path = path + db + '/'
    self.words = query.split()
    self.doc_level_search = doc_level_search
    self.results = {}
    # Document-level vs object-level search use different array directories.
    if doc_level_search:
        self.doc_path = self.path + 'doc_arrays/'
    else:
        self.doc_path = self.path + 'obj_arrays/'
    self.stemmer = stemmer
    if stemmer:
        # Stemming is best-effort: fall back silently (with a stderr notice)
        # if the language is unsupported or PyStemmer is missing.
        try:
            from Stemmer import Stemmer
            self.stemmer = Stemmer(stemmer)  # where stemmer is the language selected
            self.words = [self.stemmer.stemWord(word) for word in self.words]
        except KeyError:
            print >> sys.stderr, "Language not supported by stemmer. No stemming will be done."
        except ImportError:
            print >> sys.stderr, "PyStemmer is not installed on your system. No stemming will be done."
class Overview(Feature):
    # Bag-of-words feature over a movie's overview text: transliterate, strip
    # punctuation, stem, drop stop words, emit sorted unique stems.
    description = """
    Basic: Overview
    """.strip()

    def __init__(self, *args, **kwargs):
        Feature.__init__(self)
        self.stemmer = Stemmer('english')
        self.goodChars = frozenset('abcdefghjiklmnopqrstuvwxyz0123456789')
        # Stop list of already-stemmed English words (function words, numbers,
        # weekdays, months, comparatives, etc.).
        self.stopList = frozenset(['a', 'abaft', 'aboard', 'about', 'abov',
            'absent', 'accord', 'account', 'across', 'addit', 'afor', 'after',
            'against', 'ago', 'ahead', 'all', 'along', 'alongsid', 'alreadi',
            'also', 'am', 'amid', 'amidst', 'among', 'amongst', 'an', 'and',
            'anenst', 'ani', 'anoth', 'anybodi', 'anyhow', 'anyon', 'anyth',
            'anywher', 'apart', 'apr', 'april', 'apropo', 'apud', 'are',
            'around', 'as', 'asid', 'astrid', 'at', 'athwart', 'atop', 'aug',
            'august', 'back', 'bad', 'bar', 'be', 'becaus', 'been', 'befor',
            'begin', 'behalf', 'behest', 'behind', 'below', 'beneath', 'besid',
            'best', 'better', 'between', 'beyond', 'big', 'bigger', 'biggest',
            'billion', 'blah', 'bln', 'both', 'but', 'by', 'c', 'ca', 'call',
            'can', 'cannot', 'cant', 'case', 'circa', 'close', 'concern',
            'could', 'couldt', 'current', 'daili', 'day', 'dec', 'decemb',
            'despit', 'did', 'do', 'doe', 'doesnt', 'done', 'dont', 'down',
            'due', 'dure', 'each', 'eight', 'eighteen', 'eighth', 'eighti',
            'eleven', 'end', 'enough', 'ever', 'except', 'exclud', 'fail',
            'far', 'feb', 'februari', 'few', 'fifth', 'first', 'five',
            'fiveteen', 'fivti', 'follow', 'for', 'forenenst', 'four',
            'fourteen', 'fourth', 'fourti', 'fri', 'friday', 'from', 'front',
            'full', 'further', 'get', 'given', 'go', 'gone', 'goot', 'had',
            'hadnt', 'has', 'hasnt', 'have', 'havent', 'he', 'her', 'here',
            'herself', 'high', 'higher', 'hightst', 'himself', 'his', 'how',
            'hunderd', 'i', 'if', 'in', 'includ', 'insid', 'instead', 'into',
            'is', 'it', 'itself', 'jan', 'januari', 'jul', 'juli', 'jun',
            'june', 'just', 'last', 'late', 'later', 'latest', 'left', 'lest',
            'lieu', 'like', 'littl', 'long', 'low', 'lower', 'lowest', 'made',
            'make', 'mani', 'mar', 'march', 'may', 'me', 'mean', 'mid',
            'midst', 'might', 'milliard', 'million', 'mine', 'minus', 'mld',
            'mln', 'modulo', 'mon', 'monday', 'month', 'more', 'most', 'mth',
            'much', 'must', 'my', 'myself', 'near', 'need', 'neednt',
            'neither', 'never', 'next', 'nine', 'nineteen', 'nineth', 'nineti',
            'no', 'none', 'nor', 'not', 'notwithstand', 'nov', 'novemb',
            'number', 'o', 'oct', 'octob', 'of', 'off', 'on', 'one', 'onli',
            'onto', 'oppos', 'opposit', 'or', 'order', 'other', 'ought', 'our',
            'ourselv', 'out', 'outsid', 'over', 'owe', 'pace', 'past', 'per',
            'place', 'plus', 'point', 'previous', 'prior', 'pro', 'pursuant',
            'put', 'qua', 'rather', 'recent', 'regard', 'regardless',
            'respect', 'right', 'round', 'said', 'sake', 'same', 'san', 'sat',
            'saturday', 'save', 'saw', 'say', 'second', 'see', 'seen', 'sep',
            'septemb', 'seven', 'seventeen', 'seventh', 'seventi', 'sever',
            'shall', 'she', 'should', 'shouldnt', 'show', 'shown', 'sinc',
            'six', 'sixteen', 'sixth', 'sixti', 'small', 'smaller', 'smallest',
            'so', 'some', 'somebodi', 'somehow', 'someon', 'someth',
            'somewher', 'soon', 'sooner', 'spite', 'start', 'still', 'subsequ',
            'such', 'sun', 'sunday', 'take', 'taken', 'tell', 'ten', 'tenth',
            'than', 'thank', 'that', 'the', 'their', 'them', 'themselv',
            'there', 'these', 'they', 'third', 'thirteen', 'thirti', 'this',
            'those', 'thousand', 'three', 'through', 'throughout', 'thru',
            'thruout', 'thu', 'thursday', 'till', 'time', 'to', 'today',
            'told', 'too', 'took', 'top', 'toward', 'tue', 'tuesday', 'twelv',
            'twenti', 'two', 'under', 'underneath', 'unit', 'unlik', 'until',
            'unto', 'up', 'upon', 'us', 'use', 'versus', 'via', 'vice',
            'view', 'virtu', 'vis', 'visavi', 'vs', 'was', 'we', 'wed',
            'wednesday', 'week', 'well', 'went', 'were', 'what', 'when',
            'where', 'whether', 'whi', 'which', 'while', 'who', 'whose',
            'will', 'with', 'within', 'without', 'wont', 'wors', 'worst',
            'worth', 'would', 'wrt', 'xor', 'year', 'yes', 'yesterday', 'yet',
            'you', 'your', 'yourself', 'yourselv', 'yr'])

    def preprocess(self, s):
        # Transliterate to ASCII, keep only allowed characters, then stem.
        chars = []
        for c in unidecode(s.strip().lower()):
            if c in self.goodChars:
                chars.append(c)
        word = ''.join(chars)
        return self.stemmer.stemWord(word)

    def extract(self, m):
        # Comma-joined, sorted, unique non-stop stems of the overview text.
        t = m.overview
        return ','.join(sorted(list(set(filter(lambda w: len(w) > 0 and w not in self.stopList, map(self.preprocess, t.split()))))))
# Python 2 module (uses the removed `sets` module): shared constants for
# tokenizing and stemming English wiki text.
from Stemmer import Stemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords as nltk_stopwords
from sets import Set
import pdb
from config import *
import string

STOPWORDS = Set(nltk_stopwords.words('english'))
# Tokens typical of URLs/markup that carry no content signal.
URL_STOP_WORDS = Set([
    "http", "https", "www", "ftp", "com", "net", "org", "archives", "pdf",
    "html", "png", "txt", "redirect"
])
STEMMER = Stemmer('english')
LEMMATIZER = WordNetLemmatizer()
EXTENDED_PUNCTUATIONS = Set(list(string.punctuation) + ['\n', '\t', " "])
INT_DIGITS = Set(["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"])
MAX_WORD_LEN = 10
MIN_WORD_LEN = 3


def isEnglish(s):
    # True when `s` round-trips to ASCII, i.e. contains no non-ASCII bytes.
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True
class Stemmer(object):
    """Minimal wrapper around PyStemmer's Porter stemmer."""

    def __init__(self):
        self.stemmer = PyStemmer('porter')

    def stem(self, word):
        # Return the Porter stem of `word`.
        return self.stemmer.stemWord(word)
import re
import time
import math
import operator
from nltk.corpus import stopwords
import os
import pickle
from Stemmer import Stemmer

stop_words = set(stopwords.words('english'))
ps = Stemmer('porter')
# NOTE(review): machine-specific absolute path -- consider a config option.
root_path = "/Users/rishabhmurarka/Desktop/3rd SEm/IRE/Phase_2/"


def isASCII(word):
    """
    removing non-ascii charcaters from a string. If the word is a ascii character, then only
    include it into the list or dictionary or even file
    :param word: the string which is to be checked for non-ascii presence
    :return: True if the word is an ascii character ; otherwise false
    """
    try:
        word = word.encode('ascii')
    except UnicodeEncodeError:
        return False
    return True


# NOTE(review): definition truncated in this chunk -- body continues elsewhere.
def preprocessing(unprocessed_data):
    """
import train
import Body
from Stemmer import Stemmer
import codecs
import nltk
import re
import math
import random
from nltk.corpus import stopwords

# Farsi stop words plus punctuation tokens treated as stop words.
stop_words = stopwords.words('farsi')
stop_words.append(".")
stop_words.append(":")
stop_words.append("،")
import operator
import glob

# NOTE(review): no-argument construction -- this `Stemmer` is presumably a
# local wrapper (PyStemmer requires a language argument); confirm the import.
stemmer = Stemmer()


def sigmoid(x):
    # Logistic function 1 / (1 + e^-x).
    return (1 / (1 + math.exp(-x)))


def words_list(paragraph, stemmed_sent):
    ''' takes a text as an input and reduce it to embbeded list of stemmed-words of sentences (without stop words)'''
    fp = codecs.open(paragraph, 'r', 'utf8')
    txt = fp.read()
    fp.close()
    # Input files use Windows line endings as sentence separators.
    sentences = txt.split('\r\n')
    #print('sentences', sentences)
    embbeded = []
    # NOTE(review): loop body truncated in this chunk -- continues elsewhere.
    for line in sentences:
def merge_d(d1, d2):
    # Merge two dicts of strings: keys unique to one side keep their value;
    # shared keys get both values joined with '+'.
    union = {}
    for key in set(d1.keys()).union(d2.keys()):
        union[key] = []
        if key in d1 and key not in d2:
            # if the key is only in d1
            union[key] = d1[key]
        if key in d2 and key not in d1:
            union[key] = d2[key]
        if key in d1 and key in d2:
            union[key] = d1[key] + "+" + d2[key]
    return union


startTime = datetime.now()
stop_words = get_stop_words('en')
p_stemmer = Stemmer('english')
# Range of count_list lines to process in this run.
start = 900
end = 902

c = open('count_list', 'r+')
lines = c.readlines()
c.close()

# NOTE(review): loop body truncated in this chunk -- continues elsewhere.
for i in range(start, end):
    all_dicts = []
    pair = lines[i].split(':')
    file_num = int(pair[0]) + 1
    count = int(pair[1].split(',')[0].strip().replace('[', '')) + 1
    i_count = count
    input_file = "./output/output_%d" % file_num
    output_file = "./indices/index_%d" % file_num
class Help(Processor):
    # Python 2 code (iteritems, itervalues, u-strings): interactive help
    # browser over the bot's loaded processors, with stemmed keyword search.
    usage = u"""
    what can you do|help
    help me with <category>
    how do I use <feature>
    help <(category|feature)>
    """
    feature = ('help', )

    stemmer = Stemmer('english')

    def _get_features(self):
        """Walk the loaded processors and build dicts of categories and
        features in use. Dicts are cross-referenced by string.
        """
        categories = {}
        for k, v in ibid.categories.iteritems():
            v = copy(v)
            v.update({
                'name': k,
                'features': set(),
            })
            categories[k] = v
        features = {}
        processor_modules = set()
        for processor in ibid.processors:
            for feature in getattr(processor, 'feature', []):
                if feature not in features:
                    features[feature] = {
                        'name': feature,
                        'description': None,
                        'categories': set(),
                        'processors': set(),
                        'usage': [],
                    }
                features[feature]['processors'].add(processor)
                if hasattr(processor, 'usage'):
                    features[feature]['usage'] += [
                        line.strip() for line in processor.usage.split('\n')
                        if line.strip()
                    ]
                processor_modules.add(sys.modules[processor.__module__])
        # Attach module-level metadata (descriptions, category links).
        for module in processor_modules:
            for feature, meta in getattr(module, 'features', {}).iteritems():
                if feature not in features:
                    continue
                if meta.get('description'):
                    features[feature]['description'] = meta['description']
                for category in meta.get('categories', []):
                    features[feature]['categories'].add(category)
                    categories[category]['features'].add(feature)
        # Drop categories with no features.
        categories = dict(
            (k, v) for k, v in categories.iteritems() if v['features'])
        usere = re.compile(r'[\s()[\]<>|]+')
        # Precompute stemmed keyword sets for usage and description search.
        for name, feat in features.iteritems():
            feat['usage_keywords'] = frozenset(
                self.stemmer.stemWord(word.strip())
                for word in usere.split(u' '.join(feat['usage']))
                if word.strip())
        for name, cat in categories.iteritems():
            cat['description_keywords'] = frozenset(
                self.stemmer.stemWord(word)
                for word in cat['description'].lower().split())
        # Re-key both dicts by stemmed name so lookups match stemmed input.
        for name in features.keys():
            st_name = self.stemmer.stemWord(name)
            features[st_name] = features[name]
            if st_name != name:
                del features[name]
        for name in categories.keys():
            st_name = self.stemmer.stemWord(name)
            categories[st_name] = categories[name]
            if st_name != name:
                del categories[name]
        return categories, features

    def _describe_category(self, event, category):
        """Respond with the help information for a category"""
        event.addresponse(
            u'I use the following features for %(description)s: '
            u'%(features)s\n'
            u'Ask me "how do I use ..." for more details.', {
                'description': category['description'].lower(),
                'features': human_join(sorted(category['features'])),
            }, conflate=False)

    def _describe_feature(self, event, feature):
        """Respond with the help information for a feature"""
        output = []
        desc = feature['description']
        if desc is None:
            output.append(u'You can use it like this:')
        elif len(desc) > 100:
            output.append(desc)
            output.append(u'You can use it like this:')
        elif desc.endswith('.'):
            output.append(desc + u' You can use it like this:')
        else:
            output.append(desc + u'. You can use it like this:')
        for line in feature['usage']:
            output.append(u' ' + line)
        event.addresponse(u'\n'.join(output), conflate=False)

    def _usage_search(self, event, terms, features):
        # Find features whose usage keywords cover all stemmed search terms.
        terms = frozenset(self.stemmer.stemWord(term) for term in terms)
        results = set()
        for name, feat in features.iteritems():
            if terms.issubset(feat['usage_keywords']):
                results.add(name)
        results = sorted(results)
        if len(results) == 1:
            self._describe_feature(event, features[results[0]])
        elif len(results) > 1:
            event.addresponse(
                u"Please be more specific. I don't know if you mean %s",
                human_join((features[result]['name'] for result in results),
                           conjunction=u'or'))
        else:
            event.addresponse(
                u"I'm afraid I don't know what you are asking about. "
                u'Ask "what can you do" to browse my features.')

    @match(r'^(?:help|features|what\s+(?:can|do)\s+you\s+do)$')
    def intro(self, event):
        categories, features = self._get_features()
        # Only weighted categories are shown, ordered by weight.
        categories = filter(lambda c: c['weight'] is not None,
                            categories.itervalues())
        categories = sorted(categories, key=lambda c: c['weight'])
        event.addresponse(
            u'I can help you with: %s.\n'
            u'Ask me "help me with ..." for more details.',
            human_join(c['description'].lower() for c in categories),
            conflate=False)

    @match(r'^help\s+(?:me\s+)?with\s+(.+)$')
    def describe_category(self, event, terms):
        categories, features = self._get_features()
        termset = frozenset(
            self.stemmer.stemWord(term) for term in terms.lower().split())
        # A single term may name a category exactly.
        if len(termset) == 1:
            term = list(termset)[0]
            exact = [c for c in categories.itervalues() if c['name'] == term]
            if exact:
                self._describe_category(event, exact[0])
                return
        # Otherwise search category description keywords, then substrings.
        results = []
        for name, cat in categories.iteritems():
            if termset.issubset(cat['description_keywords']):
                results.append(name)
        if len(results) == 0:
            for name, cat in categories.iteritems():
                if terms.lower() in cat['description'].lower():
                    results.append(name)
        results.sort()
        if len(results) == 1:
            self._describe_category(event, categories[results[0]])
            return
        elif len(results) > 1:
            event.addresponse(
                u"Please be more specific, I don't know if you mean %s.",
                human_join(
                    ('%s (%s)' % (categories[r]['description'].lower(), r)
                     for r in results),
                    conjunction=u'or'))
            return
        event.addresponse(
            u"I'm afraid I don't know what you are asking about. "
            u'Ask "what can you do" to browse my features.')

    @match(r'^(?:help|usage|modinfo)\s+(\S+)$')
    def quick_help(self, event, terms):
        categories, features = self._get_features()
        terms = frozenset(terms.lower().split())
        # A single term may name a category or feature exactly.
        if len(terms) == 1:
            term = list(terms)[0]
            exact = [c for c in categories.itervalues() if c['name'] == term]
            if exact:
                self._describe_category(event, exact[0])
                return
            exact = [f for f in features.itervalues() if f['name'] == term]
            if exact:
                self._describe_feature(event, exact[0])
                return
        self._usage_search(event, terms, features)

    @match(r'^how\s+do\s+I(?:\s+use)?\s+(.+)$')
    def describe_feature(self, event, feature):
        categories, features = self._get_features()
        feature = feature.lower()
        exact = [f for f in features.itervalues() if f['name'] == feature]
        if exact:
            self._describe_feature(event, exact[0])
        else:
            self._usage_search(event, frozenset(feature.split()), features)

    @match(r'^\s*(?:help\s+me\s+with|how\s+do\s+I(?:\s+use)?)\s+\.\.\.\s*$',
           version='deaddressed')
    def silly_people(self, event):
        event.addresponse(
            u'You must replace the ellipsis with the thing you are after')
import zlib
from collections import *
import xml.etree.cElementTree as et
import re
import os
from Stemmer import Stemmer
import time

# --- wiki index-building script: setup -------------------------------------

wikiFilePath = input("Please Enter path to wiki XML file :\n")

current_directory = os.getcwd()
baseDirectory = os.path.join(current_directory, r'TemporaryIndex/')
if not os.path.exists(baseDirectory):
    os.makedirs(baseDirectory)

start_time1 = time.time()

stemmer = Stemmer("english")
pattern = re.compile("[^a-zA-Z]")  # pattern for splitting text

stop_words = {}  # words that are not significant
# `with` guarantees the stop-word file is closed; the original opened it and
# never closed the handle.
with open("Stop_words.txt", "r") as stop_words_file:
    content = stop_words_file.read()
content = re.split(",", content)
for word in content:
    if word:
        stop_words[word] = True

words_index = defaultdict(list)

inTitle = 0     # indicator for title hit
inSubTitle = 1  # indicator for sub title hit
inCategory = 2  # indicator for category hit
inText = 3      # indicator for text hit
# Python 2 experiment script: tokenize an IELTS corpus file and set up several
# stemmers/lemmatizers plus a pyparsing-based prefix/suffix splitter.
# NOTE(review): hard-coded Windows path; the try block is truncated in this
# chunk (no except/finally visible).
file = open(
    "C:\\Users\Administrator\\Desktop\\myfolder\\corpora\\stats\\ielts-7to11-some.txt"
)
raw = file.read()
try:
    wordlist = nltk.word_tokenize(raw)
    lemmatizer = WordNetLemmatizer()
    print lemmatizer.lemmatize("ran")
    lanster = LancasterStemmer()
    porter = PorterStemmer()
    snowball = SnowballStemmer("english")
    isri = ISRIStemmer()
    rslp = RSLPStemmer()
    porter2 = Stemmer('english')
    endOfString = StringEnd()
    prefix = oneOf(
        "uni inter intro de con com anti pre pro per an ab ad af ac at as re in im ex en em un dis over sub syn out thermo philo geo for fore back"
    )
    suffix = oneOf("ish")
    #suffix = oneOf("or er ed ish ian ary ation tion al ing ible able ate ly ment ism ous ness ent ic ive "
    #               "ative tude ence ance ise ant age cide ium ion")
    # Grammar: optional prefix, root up to the first suffix (or end), then
    # any number of suffixes.
    word = (Optional(prefix)("prefixes") +
            SkipTo(suffix | suffix + FollowedBy(endOfString) | endOfString)("root") +
            ZeroOrMore(suffix | suffix + FollowedBy(endOfString))("suffix"))
    #word = (Optional(prefix)("prefixes") + SkipTo(FollowedBy(endOfString))("root"))
def stemmer(self):
    """Return a per-instance Stemmer for ``self.lang``, creating and caching
    it on first use."""
    try:
        return self._stemmer
    except AttributeError:
        from Stemmer import Stemmer
        self._stemmer = Stemmer(self.lang)
        return self._stemmer
def stemmer(listofTokens):
    """Return the English (Snowball) stem of every token, in order."""
    english = Stemmer("english")
    return list(map(english.stemWord, listofTokens))
class Lemmatizer(object):
    """Persian lemmatizer: maps inflected words and conjugated verbs back to
    their dictionary form using a word list, a verb list and a stemmer."""

    def __init__(self, words_file=default_words, verbs_file=default_verbs, joined_verb_parts=True):
        """Build the lookup tables.

        :param words_file: lexicon of known (already lemmatized) words.
        :param verbs_file: verb list as 'past#present' stem pairs; every
            conjugated tense is pre-expanded into ``self.verbs``.
        :param joined_verb_parts: also index compound verb forms joined
            with an underscore (stem + auxiliary, modal + stem).
        """
        self.verbs = {}
        self.stemmer = Stemmer()
        tokenizer = WordTokenizer(words_file=default_words, verbs_file=verbs_file)
        self.words = tokenizer.words
        if verbs_file:
            # Irregular "to be" is mapped by hand.
            self.verbs['است'] = '#است'
            # Map every conjugated tense back to its 'past#present' entry.
            for verb in tokenizer.verbs:
                for tense in self.conjugations(verb):
                    self.verbs[tense] = verb
            if joined_verb_parts:
                for verb in tokenizer.verbs:
                    bon = verb.split('#')[0]  # past stem
                    for after_verb in tokenizer.after_verbs:
                        self.verbs[bon + 'ه_' + after_verb] = verb
                        self.verbs['ن' + bon + 'ه_' + after_verb] = verb
                    for before_verb in tokenizer.before_verbs:
                        self.verbs[before_verb + '_' + bon] = verb

    def lemmatize(self, word, pos=''):
        """Return the lemma of *word*; *pos* (e.g. 'V', 'AJ', 'PRO') narrows the lookup."""
        if not pos and word in self.words:
            return word
        if (not pos or pos == 'V') and word in self.verbs:
            return self.verbs[word]
        # Adjectives ending in 'ی' are already in base form.
        if pos.startswith('AJ') and word[-1] == 'ی':
            return word
        if pos == 'PRO':
            return word
        if word in self.words:
            return word
        # Fall back to stemming; accept the stem only if it is a known word.
        stem = self.stemmer.stem(word)
        if stem and stem in self.words:
            return stem
        return word

    def conjugations(self, verb):
        """Expand a 'past#present' verb entry into all of its conjugated surface forms."""
        past, present = verb.split('#')
        ends = ['م', 'ی', '', 'یم', 'ید', 'ند']
        # 'هست' ("to be") conjugates irregularly; handled separately.
        if verb == '#هست':
            return ['هست' + end for end in ends] + ['نیست' + end for end in ends]
        past_simples = [past + end for end in ends]
        past_imperfects = ['می' + item for item in past_simples]
        ends = ['هام', 'های', 'ه', 'هایم', 'هاید', 'هاند']
        past_narratives = [past + end for end in ends]
        imperatives = ['ب' + present, 'ن' + present]
        if present.endswith('ا') or present in ('آ', 'گو'):
            present = present + 'ی'
        # NOTE(review): 'بودم' among these person endings looks out of place
        # (the others are plain person suffixes) -- confirm against the verb list.
        ends = ['م', 'ی', 'د', 'یم', 'ید', 'بودم', 'ند']
        present_simples = [present + end for end in ends]
        present_imperfects = ['می' + item for item in present_simples]
        present_subjunctives = [item if item.startswith('ب') else 'ب' + item for item in present_simples]
        present_not_subjunctives = ['ن' + item for item in present_simples]
        # with_nots(xs): xs plus each item negated with the 'ن' prefix.
        with_nots = lambda items: items + list(map(lambda item: 'ن' + item, items))
        # aa_refinement: normalize 'بآ'/'نآ' to 'بیا'/'نیا' for verbs starting with 'آ'.
        aa_refinement = lambda items: list(map(lambda item: item.replace('بآ', 'بیا').replace('نآ', 'نیا'), items)) if \
            items[0].startswith('آ') else items
        # NOTE(review): with_nots(present_simples) appears twice in this union.
        return aa_refinement(
            with_nots(past_simples) + with_nots(present_simples) + with_nots(past_imperfects) + with_nots(
                past_narratives) + with_nots(present_simples) + with_nots(
                    present_imperfects) + present_subjunctives + present_not_subjunctives + imperatives)
class Searcher(object):
    """Run a search on documents or objects within documents in the SQLite table.

    Three scoring options are available: Frequency, TF-IDF and BM25.
    Two methods of incrementing the scores of results are available:
    simple addition or best score.

    NOTE(review): this class is Python 2 code (print statements, iteritems).
    It relies on module-level helpers sqlite_conn, mapper, doc_counter,
    avg_doc_length, log, floor, itemgetter, json and sys.
    """

    def __init__(self, query, db, doc_level_search=True, stemmer=False, path='/var/lib/philologic/databases/'):
        """Split *query* into words and, if *stemmer* names a language, stem them.

        :param query: whitespace-separated search terms.
        :param db: database name, appended to *path*.
        :param doc_level_search: True searches whole documents, False objects.
        :param stemmer: False for no stemming, else a PyStemmer language name.
        """
        self.path = path + db + '/'
        self.words = query.split()
        self.doc_level_search = doc_level_search
        self.results = {}
        if doc_level_search:
            self.doc_path = self.path + 'doc_arrays/'
        else:
            self.doc_path = self.path + 'obj_arrays/'
        self.stemmer = stemmer
        if stemmer:
            # Stemming is best-effort: fall back silently (with a stderr
            # message) when the language or the package is unavailable.
            try:
                from Stemmer import Stemmer
                self.stemmer = Stemmer(stemmer)  # where stemmer is the language selected
                self.words = [self.stemmer.stemWord(word) for word in self.words]
            except KeyError:
                print >> sys.stderr, "Language not supported by stemmer. No stemming will be done."
            except ImportError:
                print >> sys.stderr, "PyStemmer is not installed on your system. No stemming will be done."

    def get_hits(self, word, doc=True):
        """Query the SQLite table and return a list of tuples containing the results.

        Each tuple is (id, word_freq, total_words).
        NOTE(review): the *doc* parameter is never used; doc/object mode is
        taken from self.doc_level_search instead.
        """
        cursor = sqlite_conn(self.path + 'hits_per_word.sqlite')
        if self.doc_level_search:
            cursor.execute('select doc_id, word_freq, total_words from doc_hits where word=?', (word,))
        else:
            cursor.execute('select obj_id, word_freq, total_words from obj_hits where word=?', (word,))
        return cursor.fetchall()

    def id_to_word(self, id):
        """Return the word given its ID"""
        m = mapper(self.path)
        return m[id]

    def get_idf(self, hits):
        """Return IDF score (log of total docs over docs containing the word, plus 1)."""
        total_docs = doc_counter(self.doc_path)  #### WRONG COUNT
        try:
            return log(float(total_docs) / float(len(hits))) + 1
        except ZeroDivisionError:
            # No hits at all: contribute nothing.
            return 0

    def search(self, measure='tf_idf', scoring='simple_scoring', intersect=False, display=10):
        """Score every query word and return the top *display* (id, score) pairs.

        *measure* and *scoring* name methods of this class, dispatched via
        getattr. With *intersect*, only ids matching every word are kept.
        """
        self.intersect = False
        if self.words != []:
            for word in self.words:
                hits = self.get_hits(word)
                getattr(self, measure)(hits, scoring)
                if intersect:
                    if self.intersect:
                        # Subsequent words: narrow the running id set.
                        self.docs = self.docs.intersection(self.new_docs)
                        self.new_docs = set([])
                    else:
                        # First word: seed the running id set.
                        self.intersect = True
                        self.docs = set([obj_id for obj_id in self.results])
                        self.new_docs = set([])
            if intersect:
                self.results = dict([(obj_id, self.results[obj_id]) for obj_id in self.results if obj_id in self.docs])
            return sorted(self.results.iteritems(), key=itemgetter(1), reverse=True)[:display]
        else:
            return []

    def debug_score(self, hits, scoring):
        """Score with the raw word frequency (no normalization); for debugging."""
        for obj_id, word_freq, word_sum in hits:
            getattr(self, scoring)(obj_id, word_freq)

    def tf_idf(self, hits, scoring):
        """Score hits by term frequency times inverse document frequency."""
        idf = self.get_idf(hits)
        for obj_id, word_freq, word_sum in hits:
            tf = float(word_freq) / float(word_sum)
            score = tf * idf
            getattr(self, scoring)(obj_id, score)

    def frequency(self, hits, scoring):
        """Score hits by relative term frequency only."""
        for obj_id, word_freq, word_sum in hits:
            score = float(word_freq) / float(word_sum)
            getattr(self, scoring)(obj_id, score)

    def bm25(self, hits, scoring, k1=1.2, b=0.75):
        """Score hits with BM25 (k1, b are the usual free parameters).

        A floor is applied to the normalized document length in order to
        diminish the importance of small docs; see
        http://xapian.org/docs/bm25.html
        """
        idf = self.get_idf(hits)
        avg_dl = avg_doc_length(self.path)
        for obj_id, word_freq, obj_length in hits:
            tf = float(word_freq)
            dl = float(obj_length)
            temp_score = tf * (k1 + 1.0)
            temp_score2 = tf + k1 * ((1.0 - b) + b * floor(dl / avg_dl))
            score = idf * temp_score / temp_score2
            getattr(self, scoring)(obj_id, score)

    def simple_scoring(self, obj_id, score):
        """Accumulate scores by addition across query words."""
        if self.intersect:
            self.new_docs.add(obj_id)
        if obj_id not in self.results:
            self.results[obj_id] = score
        else:
            self.results[obj_id] += score

    def dismax_scoring(self, obj_id, score):
        """Keep only the best single-word score per id (disjunction-max)."""
        if self.intersect:
            self.new_docs.add(obj_id)
        if obj_id not in self.results:
            self.results[obj_id] = score
        else:
            if score > self.results[obj_id]:
                self.results[obj_id] = score

    def lda_search(self, measure='tf_idf', scoring='simple_scoring', intersect=False, display=10):
        """Expand the query with its LDA topic's top terms, then score.

        Ids matched by only one expansion term are discarded; the rest are
        weighted by how many terms hit them.
        """
        self.intersect = False
        self.words = [words.decode('utf-8') for words in self.words]
        if self.words != []:
            lda_query = self.match_topic()
            if lda_query != None:
                for word in self.words[:1]:  # temporary slice, to offer it as an option?
                    # Give the original query word the combined weight of the topic.
                    lda_query[word] = sum([lda_query[term] for term in lda_query])
                print lda_query
                self.num_hits = {}
                for other_word, freq in lda_query.iteritems():
                    hits = self.get_hits(other_word)
                    # NOTE(review): lda_scoring mutates self.results/self.num_hits
                    # and returns None, so this assignment is unused.
                    results = self.lda_scoring(hits, scoring, freq, measure)
                self.results = dict([(obj_id, self.results[obj_id] * self.num_hits[obj_id]) for obj_id in self.results if self.num_hits[obj_id] > 1])
                return sorted(self.results.iteritems(), key=itemgetter(1), reverse=True)[:display]
            else:
                return []
        else:
            return []

    def match_topic(self):
        """Return the top-10 {term: weight} of the LDA topic best matching the query.

        Single-word queries look the topic up directly; multi-word queries
        pick the topic matched by the most words (all, else all but one)
        with the best combined position.
        """
        topic_id = int
        cursor = sqlite_conn(self.path + 'lda_topics.sqlite')
        if len(self.words) == 1:
            cursor.execute('select topic, position from word_position where word=? order by position', (self.words[0],))
            try:
                topic_id = cursor.fetchone()[0]
            except TypeError:
                # fetchone() returned None: word not in any topic.
                return None
        else:
            topic_pos = {}
            topic_matches = {}
            # NOTE(review): this query is built by string interpolation rather
            # than with ? placeholders -- unsafe if query words are untrusted.
            query = 'select topic, position from word_position where word="%s"' % self.words[0]
            for word in self.words[1:]:
                query += ' or word="%s"' % word
            cursor.execute(query)
            for topic, position in cursor.fetchall():
                if topic not in topic_pos:
                    topic_pos[topic] = position
                    topic_matches[topic] = 1
                else:
                    topic_pos[topic] += position
                    topic_matches[topic] += 1
            word_num = len(self.words)
            topics = [(topic, topic_pos[topic]) for topic in topic_pos if topic_matches[topic] == word_num]
            if topics == []:
                # Relax: accept topics matched by all but one query word.
                topics = [(topic, topic_pos[topic]) for topic in topic_pos if topic_matches[topic] == word_num - 1]
            topic_id = sorted(topics, key=itemgetter(1))[0][0]
        cursor.execute('select words from topics where topic=?', (topic_id,))
        results = json.loads(cursor.fetchone()[0])
        topic = [(term, float(freq)) for term, freq in results.iteritems()]  # if float(freq) > 0.01]
        topic = dict(sorted(topic, key=itemgetter(1), reverse=True)[:10])
        return topic

    def lda_scoring(self, hits, scoring, freq, measure):
        """Accumulate tf-idf or BM25 scores weighted by the topic term weight *freq*.

        Also counts, per id, how many expansion terms hit it (self.num_hits).
        NOTE(review): the *scoring* parameter is accepted but not used here.
        """
        if measure == 'tf_idf':
            idf = self.get_idf(hits)
            for obj_id, word_freq, word_sum in hits:
                tf = float(word_freq) / float(word_sum)
                score = tf * idf * freq
                if obj_id not in self.results:
                    self.results[obj_id] = score
                    self.num_hits[obj_id] = 1
                else:
                    self.results[obj_id] += score
                    self.num_hits[obj_id] += 1
        else:
            # BM25 branch with fixed parameters.
            idf = self.get_idf(hits)
            avg_dl = avg_doc_length(self.path)
            k1 = 1.2
            b = 0.75
            for obj_id, word_freq, obj_length in hits:
                tf = float(word_freq)
                dl = float(obj_length)
                temp_score = tf * (k1 + 1.0)
                temp_score2 = tf + k1 * ((1.0 - b) + b * floor(dl / avg_dl))
                score = idf * temp_score / temp_score2 * freq
                if obj_id not in self.results:
                    self.results[obj_id] = score
                    self.num_hits[obj_id] = 1
                else:
                    self.results[obj_id] += score
                    self.num_hits[obj_id] += 1
def merge_d(d1, d2):
    """Merge two dicts of strings.

    Keys unique to one side keep their value; keys present in both get the
    two values joined with '+'.

    NOTE(review): values are assumed to be strings (they are concatenated).
    """
    union = {}
    for key in set(d1.keys()).union(d2.keys()):
        # The three cases are mutually exclusive; the original tested all
        # three and pre-initialized union[key] to a dead [] value.
        if key in d1 and key in d2:
            union[key] = d1[key] + "+" + d2[key]
        elif key in d1:
            union[key] = d1[key]
        else:
            union[key] = d2[key]
    return union


# --- Driver: merge per-file postings for output files [start, end) ---------
startTime = datetime.now()
stop_words = get_stop_words('en')
p_stemmer = Stemmer('english')
start = 900
end = 910
# count_list maps a 0-based file number to a document count, 'num:[count,...'.
c = open('count_list', 'r+')
lines = c.readlines()
c.close()
for i in range(start, end):
    all_dicts = []
    pair = lines[i].split(':')
    file_num = int(pair[0]) + 1
    count = int(pair[1].split(',')[0].strip().replace('[', '')) + 1
    i_count = count
    input_file = "./output/output_%d" % file_num
    output_file = "./indices/index_%d" % file_num
# --- Index reader: command-line setup and index file handles ----------------
from __future__ import print_function
import xml.etree.ElementTree as etree
import re, os, heapq, math, operator, string, time, sys
from collections import *
from Stemmer import Stemmer as PyStemmer
import glob

# Python 2 only: force UTF-8 as the default string encoding.
reload(sys)
sys.setdefaultencoding('utf-8')

ps = PyStemmer('porter')
# Usage: <prog> <index directory>
if (len(sys.argv[1:]) < 1):
    print("Needs 1 argument, the index directory")
    sys.exit()
indexDirPth = sys.argv[1]
# qryTxtFlPth = sys.argv[2]
# outTxtFlPth = sys.argv[3]
# if not os.path.exists(outTxtFlPth):
#     with open(outTxtFlPth, 'w+'): pass
# else:
#     open(outTxtFlPth, 'w').close()
absltPthCurrPrgrm = os.path.abspath(os.path.dirname(sys.argv[0]))
###########################################################################
stopwords = dict()
# Parallel lists of per-field index file handles and lookup tables.
inverted_index_file, mapping, doc_offset = list(), list(), list()
inverted_index_file.append(
    open(os.path.join(indexDirPth, 'title/final.txt'), 'r'))
loc = mid break else: loc = mid break mid = (start + end) // 2 if numbers[mid] == key: return mid if numbers[mid] < key: return mid return mid -1 stemObj=Stemmer('porter') from nltk.tokenize import RegexpTokenizer tokenizer = RegexpTokenizer("\w+|\$[\d\.]+|\S+") # tokenize text def tokenizeText(textInput): normalized=[] #textInput=removeURL(textInput) #tokens = re.findall(r"\w+(?:'\w+)?|[^\w\s]", textInput) tokens=re.split(r'[^A-Za-z0-9]+',textInput) #tokens = [x for x in tokens if re.match(r"^[a-z]+$", x.lower())] for token in tokens: token=token.lower() token=token.lstrip('0') word=stemObj.stemWord(token) if word in STOP_WORDS or len(word)<=1: continue
if (word_dict[word][page][3]): arr.append('i' + str(word_dict[word][page][3])) if (word_dict[word][page][4]): arr.append('r' + str(word_dict[word][page][4])) if (word_dict[word][page][5]): arr.append('e' + str(word_dict[word][page][5])) line = "".join(arr) f.write((line + '\n')) f.close() f = open("./titles/title_" + str(sys.argv[2]) + ".txt", 'w', encoding="utf-8") for title in titles: f.write(title + '\n') f.close() start = time.time() reg1 = re.compile('[A-Za-z0-9]+') reg2 = re.compile("\[\[Category:(.*)\]\]") reg3 = re.compile("\[.*?\]") reg4 = re.compile('\{\{\s*Infobox ((.*?\n)*?) *?\s*\}\}') ps = Stemmer("porter") parser = xml.sax.make_parser() parser.setFeature(xml.sax.handler.feature_namespaces, 0) Handler = PageHandler() parser.setContentHandler(Handler) parser.parse(("./Data/" + sys.argv[1])) write_to_disk(Handler.word_dict, Handler.titles) print(time.time() - start) print(Handler.page_no)
"""Text Processing """ import re from collections import defaultdict from Stemmer import Stemmer STOP_WORDS = defaultdict(int) FP = open("stop_words.txt", "r") for l in FP: l = l.strip() l = l.lower() STOP_WORDS[l] = 1 FP.close() STEMMER = Stemmer("english") TAGS = [ "<sup>", "#REDIRECT", "format=", "dts", "dmy", "colspan", "</sup>", "<big>", "</big>", "<small>", "</small>", "</tr>", "<br>", "<br />", "<center>", "</center", "</abbr>", "<abbr", "<code>", "</code>", "<div>", "</div>", "<imagemap>", "</imagemap>", "<gallery>", "</gallery>" ] NOT_BODY = [ "==See also==", "== See also ==", "== References ==", "==References and sources==", "==References==", "== Bibliography ==", "==External links==", "== External links ==", "{{Infobox", "[[Category" ] def stem(sentence): """Stems the sentence"""
def lmScoring( sentence ):
    """Translate *sentence* word-by-word and pick the best candidate by LM score.

    Every word with more than one dictionary translation multiplies the
    candidate set (cartesian product over translation choices); words not in
    the dictionary pass through untranslated.  The candidate with the lowest
    negative log probability under the language model ``lm`` is returned.

    NOTE(review): relies on module globals ``dict`` (the translation
    dictionary -- it shadows the builtin), ``POSTAG``, ``asTokens`` and
    ``lm``; the Stemmer here is a project-local class with DICT/input
    attributes, not PyStemmer.
    """
    # candidates is the list of candiate sentences formed by trying
    # all possible definitions of all words with >1 translation
    stemmer = Stemmer()
    stemmer.DICT = dict
    candidates = []
    tokens = asTokens( sentence )
    for i in range( len(tokens) ):
        word = tokens[i]
        if word.lower() in dict:
            translations = dict[word.lower()]
            pos = POSTAG[word.lower()]
            if pos == 'V':
                # Verbs: let the stemmer pick a translation; best effort only.
                try:
                    stemmer_translations = stemmer.input([word.lower()])
                    if stemmer_translations:
                        translations = [stemmer_translations]
                except:
                    pass
            old_candidates = candidates[:]
            candidates = []
            # NOTE(review): k is set here and then reused as the loop index below.
            k = len(translations)
            if k > 1:
                # Branch on ambiguity: extend every existing candidate with
                # every possible translation.
                if len(old_candidates) == 0:
                    for k in range(len(translations)):
                        candidates.append( [translations[k]] )
                else:
                    for k in range(len(translations)):
                        for c in old_candidates:
                            cnew = c + [translations[k]]
                            candidates.append( cnew )
            else:
                # append the current word to all candidate sentences
                if len(old_candidates) == 0:
                    candidates.append( [translations[0]] )
                else:
                    for c in old_candidates:
                        cnew = c + [translations[0]]
                        candidates.append( cnew )
        else:
            # words not in dictionary pass through untranslated
            translations = [word]
            old_candidates = candidates[:]
            candidates = []
            if len(old_candidates) == 0:
                candidates.append( [translations[0]] )
            else:
                for c in old_candidates:
                    cnew = c + [translations[0]]
                    candidates.append( cnew )
    # Pick the candidate with the smallest negative log probability.
    neglobprob = [lm.sentenceProbability( ' '.join(cs) ) for cs in candidates ]
    bestSentence = candidates[ neglobprob.index( min(neglobprob) ) ]
    return ' '.join(bestSentence)
# --- Extractive summarizer: globals and stop-word table ---------------------
from Stemmer import Stemmer
import sys
import re, os
import math
from collections import defaultdict
from copy import deepcopy
import subprocess

st = Stemmer('english')
pattern = re.compile( r'[\d+\.]*[\d]+|[^\w]+' )  #pattern to detect numbers (real/integer) non alphanumeric (no underscore)
Summary = []
lamda = 6      # summary length parameter (name avoids the `lambda` keyword)
alpha = 0.75

#stopword dictionary from "stopwords.txt" file
stopWordDict = defaultdict(int)
stopWordFile = open("./stopwords.txt", "r")
for line in stopWordFile:
    stopWordDict[line.strip()] = 1


def extractDocumentCorpus(folder):
    """Walk *folder* and read each file into a per-document sentence corpus.

    NOTE(review): Python 2 (print statements); this function continues
    beyond this excerpt.
    """
    os.chdir(folder)
    print folder
    document_to_senctence_corpus = {}
    for each_file in os.listdir('.'):
        print each_file
        fileptr = open(each_file, 'r')
# --- Hindi places dataset: stemmers and title -> Wikidata-id lookup ---------
import json
import pickle as pkl
import requests
from requests import utils
import ast
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS

stop_words = set(stopwords.words("english"))
# Suppress urllib3 TLS warnings for the scraping requests below.
requests.packages.urllib3.disable_warnings()
from nltk.stem import SnowballStemmer, PorterStemmer
from Stemmer import Stemmer

# Both NLTK and PyStemmer variants are kept side by side (presumably for
# comparison -- confirm which ones the pipeline actually uses).
en_stemmer = SnowballStemmer('english')
porter_stemmer = PorterStemmer()
port_pystemmer = Stemmer('porter')
en_pystemmer = Stemmer('english')
stem_words = {}
words_dict = {}
title_dict = {}
total_num_tokens = 0
hindi_places_data = {}
with open('places_dataset_3.json') as f:
    infot = json.load(f)
# Map each Hindi Wikipedia title to its position in the dataset...
mapping = {
    info['hi_wikipedia_title']: ind
    for ind, info in enumerate(infot['data'])
}
# ...and to its Wikidata id.
for info in infot['data']:
    hindi_places_data[info['hi_wikipedia_title']] = info['wd_id']
def __init__(self):
    """Create the Porter stemmer used by this instance."""
    self.stemmer = PyStemmer("porter")
from nltk.corpus import stopwords #from nltk.stem import PorterStemmer from Stemmer import Stemmer from string import punctuation from nltk.tokenize import wordpunct_tokenize import time import sys import errno import heapq import shutil stop_words = set(stopwords.words('english')) stop_words.update(list(char for char in punctuation)) stemmer = Stemmer('english') text_punc = list(punc for punc in punctuation if punc not in ['{', '}', '=', '[', ']']) text_punc.append('\n') # words_left = ['{', '}', '=', '[', ']' ] def writing_to_file(Inverted_Index, File_count, file_path): path_to_write = os.path.join(file_path, str(File_count) + '.txt') #print("File",str(File_count)) value = list() file_pointer = open(path_to_write, 'w+') for term in sorted(Inverted_Index):
def __init__(self):
    """Load the stop-word list and create the English stemmer."""
    self.stoplist = gen_stops()
    self.stemmer = Stemmer('english')
from __future__ import print_function import xml.etree.ElementTree as etree import re, sys, os, heapq, math from collections import * from Stemmer import Stemmer as PyStemmer import glob reload(sys) sys.setdefaultencoding('utf-8') ps = PyStemmer('porter') if (len(sys.argv[1:]) < 2): print("needs 3 arguments") sys.exit() pathWikiXML = sys.argv[1].strip() outputDirPth = sys.argv[2].strip() if not os.path.exists(outputDirPth): os.makedirs(outputDirPth) absltPthCurrPrgrm = os.path.abspath(os.path.dirname(sys.argv[0])) # print("existential question ",os.path.exists(outputDirPth)) # print(pathWikiXML) # file = sys.argv[0] # pathname = os.path.dirname(file) ########################################################################## stopwords, allwords = dict(), dict() prntLst = ['t', 'p', 'c'] dir_names = ["title", "text", "category"]
def __init__(self):
    # type: () -> None
    """Set up the Porter stemmer used by this instance."""
    self.stemmer = PyStemmer('porter')
def __init__(self):
    """Create the instance's Porter stemmer."""
    self.stemmer = PyStemmer('porter')
def __setstate__(self, state):
    """Restore from pickle: recreate the stemmer (not part of the pickled
    state) and unpack the two vocabulary mappings."""
    self.stemmer = Stemmer('russian')
    self.word_to_idx, self.idx_to_word = state
"""Index loader: stemmer, stop words and the global index tables.

Fixed duplication in the original: `import nltk`, `from Stemmer import
Stemmer`, `nltk.download('stopwords')` and `stemmer = Stemmer('porter')`
each appeared twice; the second occurrence of each is removed (all were
no-ops or idempotent repeats).
"""
import nltk
nltk.download('stopwords')  # idempotent; fetches the corpus on first run only
import os
import bisect
import math
import sys
from pathlib import Path
from collections import OrderedDict, Counter
from Stemmer import Stemmer
from nltk.corpus import stopwords

PageCount = 0
stemmer = Stemmer('porter')
stop_words = stopwords.words('english')
stopwords_dict = Counter(stop_words)  # Counter gives O(1) membership tests
SecondaryIndex = []
WordPageFreq = {}
WordPageId = {}
WordIdf = {}
WordTfIdf = {}
IdTitles = {}
TopKwords = 0


def getTitles():
    f=open("indexfiles/titles.txt","r")
class StemCorpus(Corpus):
    """Corpus whose vocabulary is built over Russian word stems.

    Words are lowercased and stemmed before counting, so inflected forms
    share one vocabulary slot.
    """

    def __init__(self):
        super().__init__()
        self.stemmer = Stemmer('russian')

    def __getstate__(self):
        # The stemmer is excluded from pickling; only the mappings survive.
        return self.word_to_idx, self.idx_to_word

    def __setstate__(self, state):
        # Recreate the stemmer and restore the two mappings.
        self.stemmer = Stemmer('russian')
        self.word_to_idx, self.idx_to_word = state

    def encode_word(self, word):
        """Return the vocabulary index of *word*'s stem; unknown stems map to
        the last index (the UNK slot appended in build())."""
        stem_form = self.stemmer.stemWord(word.lower())
        return self.word_to_idx.get(stem_form, len(self.idx_to_word) - 1)

    def build(self, sentences, vocabulary_size=50000, log_every=100000):
        """Count stems over *sentences* and build idx_to_word / word_to_idx.

        The vocabulary is trimmed to *vocabulary_size* most frequent stems,
        then the special NUM/ENG/EOS/UNK tags are placed at the tail.
        """
        print('= Start building vocabulary')
        vocab = defaultdict(int)
        # NOTE(review): saved_sentences is filled but never used or returned.
        saved_sentences = []
        for i, s in enumerate(sentences, 1):
            line = s.lower().split()
            for tok in line:
                if tok in PUNKT_TAGS:
                    continue
                stem_form = self.stemmer.stemWord(tok.lower())
                vocab[stem_form] += 1
            if i % log_every == 0:
                print('--- Processed {} sentences'.format(i))
            saved_sentences.append(line)
        print('= Built vocabulary with size {}'.format(len(vocab)))
        if vocabulary_size < len(vocab):
            print('= Trim it to {}'.format(vocabulary_size))
        # Keep the most frequent stems (ordering given by _freq_sorter).
        word_freq = list(
            map(itemgetter(0),
                sorted(vocab.items(), key=_freq_sorter, reverse=True)))
        word_freq = word_freq[:vocabulary_size]
        print('Top 10 most frequent words: {}'.format(', '.join(
            word_freq[:10])))
        print('Top 10 least frequent words: {}'.format(', '.join(
            word_freq[-10:])))
        print('= Building word to index mapping')
        # NOTE(review): these assignments OVERWRITE the two least frequent
        # stems with the NUM/ENG tags rather than appending -- confirm
        # intentional.
        if Tag.NUM not in word_freq:
            word_freq[-2] = Tag.NUM
        if Tag.ENG not in word_freq:
            word_freq[-1] = Tag.ENG
        assert Tag.EOS not in word_freq
        word_freq.append(Tag.EOS)
        assert Tag.UNK not in word_freq
        word_freq.append(Tag.UNK)
        self.idx_to_word.clear()
        self.word_to_idx.clear()
        for w in word_freq:
            self.word_to_idx[w] = len(self.idx_to_word)
            self.idx_to_word.append(w)
        print('= Built mappings')
        print('idx_to_word size = {}, word_to_idx size = {}'.format(
            len(self.idx_to_word), len(self.word_to_idx)))
def stemmer(listofTokens):
    """Apply English Snowball stemming to each token and return the list of stems."""
    stem_word = Stemmer("english").stemWord
    return list(map(stem_word, listofTokens))
def __init__(self):
    """Initialize the parent corpus and attach a Russian Snowball stemmer."""
    super().__init__()
    self.stemmer = Stemmer('russian')
# --- Word-prediction experiment (window=20): data and model setup -----------
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.pipeline import FeatureUnion
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
import pandas
from Stemmer import Stemmer

stem = Stemmer('english')
# NLTK English stop words plus a few extras.
# (Removed the dead `stop_words = ['.', ',']` that was immediately
# overwritten by the set below.)
stop_words = set(stopwords.words('english'))
stop_words.add('.')
stop_words.add('I')
f = open('../../data/processed_data.txt', 'r')
q = open('../../data/queries.txt', 'r')
o = open('../../data/options.txt', 'r')
a = open('../../data/answers.txt', 'r')
# Fixed: 'wa+' is not a valid open() mode (glibc silently treated it as
# 'w+'; Python 3 rejects it). 'w+' makes the intent explicit and portable.
tmp = open('tmp.txt', 'w+')
nouns = []
WINDOW = 20
TEST_SIZE = 40000
data = ""
# Priority-queue and math helpers for the index builder.
from heapq import heappush, heappop
import sys
from math import log10
from pympler.asizeof import asizeof

# In[3]:

# NOTE(review): `stopwords`, `re` and `Stemmer` are used below but are
# imported elsewhere in the original notebook/file -- confirm.
stop_words = set(stopwords.words('english'))


def stopWords(listOfWords):
    """Return *listOfWords* with English stop words removed."""
    #Stop Words Removal
    temp = [key for key in listOfWords if key not in stop_words]
    return temp


ps = Stemmer("english")


def myTokenizer(text):
    """Split *text* into lowercase alphanumeric tokens, dropping
    punctuation-only pieces and stripping symbol characters from the rest."""
    # First pass: split on word boundaries, keeping the captured pieces.
    words = re.split(r'(\b[^-\s]+\b)((?<=\.\w).)?', text)
    tok = [i for i in words if i != None and i != " " and i != ""]
    # Keep tokens made of letters/digits/'/-/. that are not purely punctuation.
    tok = [
        word.lower() for word in tok
        if re.match('^[a-zA-Z0-9\'-.]+$', word) and not re.match('^[\',-_]+$', word)
        and not re.match('^[^\w]+$', word)
    ]
    fin_tok = []
    for t in tok:
        # Strip remaining symbol characters from each surviving token.
        fin_tok.append(re.sub("[\+*=&$@/(),.\-!?:]+", '', t))
    fin_tok = [i for i in fin_tok if i != None and i != " " and i != ""]
    return fin_tok
def text_cleaner(text):
    """Normalize Russian text: lowercase it, stem every word, and replace
    standalone numbers with the literal token ' digit '."""
    lowered = text.lower()
    stemmer = Stemmer('russian')
    stemmed = ' '.join(stemmer.stemWords(lowered.split()))
    return re.sub(r'\b\d+\b', ' digit ', stemmed)
# --- Word-prediction experiment (window=7): data and model setup ------------
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.pipeline import FeatureUnion
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
import pandas
from Stemmer import Stemmer

stem = Stemmer('english')
# NLTK English stop words plus a few extras.
# (Removed the dead `stop_words = ['.', ',']` that was immediately
# overwritten by the set below.)
stop_words = set(stopwords.words('english'))
stop_words.add('.')
stop_words.add('I')
f = open('../../data/processed_data.txt', 'r')
q = open('../../data/queries.txt', 'r')
o = open('../../data/options.txt', 'r')
a = open('../../data/answers.txt', 'r')
# Fixed: 'wa+' is not a valid open() mode (glibc silently treated it as
# 'w+'; Python 3 rejects it). 'w+' makes the intent explicit and portable.
tmp = open('tmp.txt', 'w+')
nouns = []
WINDOW = 7
TEST_SIZE = 40000
data = ""
# Word classes in jbovlaste, as (type name, human description) pairs.
TYPES = (('gismu', 'Root words.'), ('cmavo', 'Particles.'), ('cmavo-compound', 'Particle combinations.'),
         ('lujvo', 'Compound words.'), ('experimental gismu', 'Non-standard root words.'),
         ('experimental cmavo', 'Non-standard particles.'), ("fu'ivla", 'Loan words.'), ('cmene', 'Names.'),
         ('cmevla', 'Names.'), ('bu-letteral', 'Letters.'), ('zei-lujvo', 'Compound words with ZEI.'),
         ('obsolete cmevla', 'Obsolete names.'), ('obsolete cmene', 'Obsolete names.'),
         ('obsolete cmavo', 'Obsolete particles.'), ("obsolete fu'ivla", 'Obsolete loan words.'),
         ('obsolete zei-lujvo', 'Obsolete ZEI compound words.'))

# English stemming function (bound method of a shared Stemmer instance).
stem = Stemmer('english').stemWord


def load_yaml(filename):
    """Load and return the parsed contents of a YAML file.

    NOTE(review/security): yaml.load without an explicit Loader can execute
    arbitrary constructors on untrusted input and is deprecated since
    PyYAML 5.1 -- prefer yaml.safe_load if the files are not fully trusted.
    """
    with open(filename) as f:
        return yaml.load(f)


def tex2html(tex):
    """Turn most of the TeX used in jbovlaste into HTML.

    >>> tex2html('$x_1$ is $10^2*2$ examples of $x_{2}$.')
    u'x<sub>1</sub> is 10<sup>2\\xd72</sup> examples of x<sub>2</sub>.'
    >>> tex2html('\emph{This} is emphasised and \\\\textbf{this} is boldfaced.')
    u'<em>This</em> is emphasised and <strong>this</strong> is boldfaced.'
    """