from functools import lru_cache

_lemmatizer = None

def lemmatizer():
    # Lazy load lemmatizer because it is slow on import and first use
    global _lemmatizer
    if not _lemmatizer:
        from nltk.stem import WordNetLemmatizer
        _lemmatizer = lru_cache(maxsize=50000)(WordNetLemmatizer().lemmatize)
    return _lemmatizer
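A minimal usage sketch for the lazy accessor above; `nltk.download('wordnet')` may be needed once beforehand if the WordNet corpus is not installed:

# First call pays the NLTK import and model-load cost; later calls return
# the same LRU-wrapped bound method.
lemmatize = lemmatizer()
print(lemmatize('cars'))          # -> 'car'
print(lemmatize('running', 'v'))  # -> 'run' (the POS argument is part of the cache key)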
def __init__(self, load_doc_vec=True):
    SetupClassifier.__init__(self, load_doc_vec)
    self.lemmatize = lru_cache(maxsize=50000)(WordNetLemmatizer().lemmatize)
    self.exclude = set(stopwords.words('english'))
    logging.info('classifier ready')
    self.article = None
    self.doc_vec_norms = numpy.linalg.norm(
        numpy.asarray(self.doc_vec, dtype=numpy.float32), axis=1)
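Precomputing `doc_vec_norms` in the constructor suggests the norms are reused across many similarity queries. A sketch of that pattern under that assumption; `most_similar`, `query_vec`, and `top_k` are illustrative names, not part of the original class:

import numpy

def most_similar(doc_vec, doc_vec_norms, query_vec, top_k=5):
    """Illustrative: rank rows of doc_vec by cosine similarity to query_vec,
    reusing the precomputed row norms instead of recomputing them per query."""
    dots = numpy.dot(numpy.asarray(doc_vec, dtype=numpy.float32), query_vec)
    sims = dots / (doc_vec_norms * numpy.linalg.norm(query_vec) + 1e-12)
    return numpy.argsort(sims)[::-1][:top_k]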
def __init__(self, file_name):
    self.speeches = open(file_name, 'r').read()
    ## Lemmatizer with cache
    self.lemmatize = lru_cache(maxsize=50000)(WordNetLemmatizer().lemmatize)
    self.exclude = set(stopwords.words('english'))
    self.not_processed = 0
    logger.info("Instance created.")
def __init__(self):
    ## Lemmatizer with cache
    self.lemmatize = lru_cache(maxsize=50000)(WordNetLemmatizer().lemmatize)
    self.exclude = set(stopwords.words('english'))
    # NOTE: eval() on file contents assumes a trusted, dict-literal file;
    # ast.literal_eval would be the safer choice.
    self.tag_count = eval(open('count_tags.txt', 'r').read())
    self.tagset = set(self.tag_count.keys())
    # pymongo.Connection is the legacy (pre-3.0) client; newer code uses MongoClient.
    self.conn = pymongo.Connection().articles.collection_1
    self.excluded_articles = 0
    logger.info("Instance created.")
def __init__(self, num_topics=4):
    self.dictionary = gensim.corpora.Dictionary()
    self.lemmatize = lru_cache(maxsize=50000)(WordNetLemmatizer().lemmatize)
    self.exclude = set(stopwords.words('english'))
    self.num_topics = num_topics
    self.HTMLTextBlobber = usefulText.HTMLTextBlob()
    self.formatted_text = []
    self.corpus = []
    self.url_counter = 0
    self.redis = redis.StrictRedis(host='localhost', port=6379, db=5)
    logging.info('Tracker initiated...')
    self.log = open('log.txt', 'a')
def __init__(self):
    self.lemmatize = functools32.lru_cache(maxsize=1000000)(
        nltk.stem.WordNetLemmatizer().lemmatize)
    text = open('stopwords.txt').read().split(',')
    self.stopwords = [word.strip() for word in text]  # proper stopwords
def memoize(func):
    """Cache the value returned by a function call."""
    func = functools32.lru_cache()(func)
    _memoized_functions.append(func)
    return func
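A usage sketch for the registry above, assuming `_memoized_functions` is the module-level list the decorator appends to; `clear_memoized_values` is an illustrative name for the bulk-flush helper such a registry enables:

_memoized_functions = []  # module-level registry populated by memoize()

def clear_memoized_values():
    """Illustrative: flush every cache registered via memoize()."""
    for fn in _memoized_functions:
        fn.cache_clear()  # cache_clear() is provided by lru_cache

@memoize
def load_settings(path):
    with open(path) as f:
        return f.read()

load_settings('settings.cfg')  # computed once...
load_settings('settings.cfg')  # ...then served from cache
clear_memoized_values()        # caches flushed, next call recomputes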
# -*- coding: utf-8 -*-
"""
GitHub related domain models.
"""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from functools32 import lru_cache

from ..lib.edx_repo_tools_data.utils import get_people as _get_people
from ..lib.exceptions import NotFoundError
from ..lib.github.models import GithubWebHookEvent

get_people = lru_cache()(_get_people)


class GithubEvent(GithubWebHookEvent):
    """
    A GitHub webhook event.

    Attributes:
        gh (github3.GitHub): An authenticated GitHub API client session
        event_type (str): GitHub event type
        event (Dict[str, Any]): The parsed event payload
    """

    def __init__(self, gh, event_type, event):
        """
        Init.

        Arguments:
def wrapper(fn):
    # Inner function of a decorator factory: closes over `maxsize`
    # and returns fn wrapped in a bounded LRU cache.
    return functools32.lru_cache(maxsize=maxsize)(fn)
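The enclosing decorator factory is not shown in the snippet; a minimal sketch of its likely shape, with the outer name `lru_cache_with_maxsize` chosen purely for illustration:

import functools32

def lru_cache_with_maxsize(maxsize=128):
    # Hypothetical factory: parameterizes the cache bound for the inner wrapper.
    def wrapper(fn):
        return functools32.lru_cache(maxsize=maxsize)(fn)
    return wrapper

@lru_cache_with_maxsize(maxsize=256)
def fib(n):
    return n if n < 2 else fib(n - 1) + fib(n - 2)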
import os

import nltk
#from snowballstemmer import EnglishStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from functools32 import lru_cache
import numpy as np

path = os.path.dirname(os.path.abspath(__file__))
max_terms = 11500

#stemmer = EnglishStemmer()
lmtzr = WordNetLemmatizer()
#stemWords = lru_cache(maxsize=max_terms)(stemmer.stemWords)
lemmatize = lru_cache(maxsize=max_terms)(lmtzr.lemmatize)


# ========================================================================= #
"""
read all text documents in enron and save as pandas table.
columns of table are [file], [text], and [label].
"""
def save_enron(path='/home/cilsat/dev/nlp', form='dataframe'):
    enron_data = {}
    enron_label = {}
    e_dirs = [e for e in os.listdir(path) if e.startswith('enron')]
    for dirs in e_dirs:
        data, labels = build_token_dict(os.path.join(path, dirs), getlabels=True)
        enron_data.update(data)
        enron_label.update(labels)
from flask import request, jsonify, make_response, Response
from flask import current_app

from appbase.flaskutils import add_cors_headers, jsonify_unsafe
from appbase.errors import BaseError, AccessDenied, NotFoundError
from appbase.errors import InvalidSessionError
import appbase.users.sessions as sessionlib
import appbase.context as context

if settings.DB_TRANSACTIONS_ENABLED:
    from appbase.pw import dbtransaction
else:
    dbtransaction = lambda f: f

SESSION_COOKIE_NAME = getattr(settings, 'SESSION_COOKIE_NAME', '__s')
cache = lru_cache()
cache_ttl = datetime.timedelta(0, (10 * 60))


def extract_kw(request):
    return (request.args and dict((k, v) for (k, v) in request.args.items())) or \
           request.json or \
           (request.data and json.loads(request.data.decode('utf-8'))) or \
           request.form or \
           {}


def flaskapi(app, f, jsonify_result=True):
    @wraps(f)
    def wrapper(*args, **kw):
        logger = current_app.logger
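`cache` above is the bare decorator returned by `lru_cache()`, stored so it can be applied to functions elsewhere; note that `lru_cache` has no time-based expiry, so `cache_ttl` must be enforced separately. A small illustration with a hypothetical lookup function:

def _load_user(user_id):
    # stand-in for an expensive database lookup
    return {'id': user_id}

load_user = cache(_load_user)  # equivalent to decorating _load_user with @cache
load_user.cache_clear()        # clearing periodically is one way to emulate the TTL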
import os
import argparse
import logging

from nltk.stem import SnowballStemmer, WordNetLemmatizer
from functools32 import lru_cache

# Input arguments
PROGRAM_DESCRIPTION = "Read tweets from collection"
parser = argparse.ArgumentParser(description=PROGRAM_DESCRIPTION)
parser.add_argument('collection_name', type=str, help='collection to read tweets from')
parser.add_argument('directory', type=str, help='directory to store')
parser.add_argument('unique_user_file', type=str, help='path to unique user list in csv')
parser.add_argument('prefix', type=str, help='used in output file name, e.g. hashtag')
args = vars(parser.parse_args())

# initializing lemmatizer
stemmer = SnowballStemmer("english")
wordnet_lemmatizer = WordNetLemmatizer()
lemmatize = lru_cache(maxsize=50000)(wordnet_lemmatizer.lemmatize)


def main():
    collection_name = args['collection_name']
    dir = args['directory']
    unique_user_file = args['unique_user_file']
    DIRECTORY = args['prefix']
    try:
        os.stat(dir)
    except OSError:
        os.mkdir(dir)

    log_file = dir + '/' + DIRECTORY + "_log.log"
    logging.basicConfig(filename=log_file, level=logging.DEBUG,
                        format='%(asctime)s %(message)s')
# coding=utf-8
import codecs
import re
import cv2
import os

from utils import create_xmlfile_online
from functools32 import lru_cache

memorized = lru_cache(maxsize=10000)


class PatternMatcher(object):
    def __init__(self, pattern, _type):
        self.pattern = re.compile(pattern)
        self.type = _type

    def matchedType(self, pattern_str):
        return self.type if self.pattern.match(pattern_str) else None


class PatternGroupMatcher(object):
    def __init__(self, *args):
        for arg in args:
            assert isinstance(arg, PatternMatcher), arg
        self.patterns = args

    def find_type(self, pattern_str):
        for p in self.patterns:
            t = p.matchedType(pattern_str)
            if t:
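`memorized` is a preconfigured decorator, so the 10000-entry bound is set once and reused wherever caching is wanted. A brief sketch; the decorated function is illustrative, not from the original module:

@memorized
def classify(matcher, pattern_str):
    # Hypothetical: cache repeated lookups against a PatternGroupMatcher.
    # lru_cache requires hashable arguments; plain object instances qualify.
    return matcher.find_type(pattern_str)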
def __init__(self, func):
    self.func = lru_cache(maxsize=32)(func)
    self._res = None
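Wrapping `func` at construction time gives every instance its own 32-entry cache, instead of one cache shared class-wide. A self-contained sketch of the pattern, with the surrounding class name `CachedCall` assumed for illustration:

from functools import lru_cache

class CachedCall(object):
    # Illustrative container: each instance carries an independent LRU cache.
    def __init__(self, func):
        self.func = lru_cache(maxsize=32)(func)
        self._res = None

    def __call__(self, *args):
        self._res = self.func(*args)  # repeated args hit this instance's cache
        return self._res

double = CachedCall(lambda x: 2 * x)
assert double(21) == 42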
def test_lru(self):
    def orig(x, y):
        return 3 * x + y
    f = functools.lru_cache(maxsize=20)(orig)
    hits, misses, maxsize, currsize = f.cache_info()
    self.assertEqual(maxsize, 20)
    self.assertEqual(currsize, 0)
    self.assertEqual(hits, 0)
    self.assertEqual(misses, 0)

    domain = range(5)
    for i in range(1000):
        x, y = choice(domain), choice(domain)
        actual = f(x, y)
        expected = orig(x, y)
        self.assertEqual(actual, expected)
    hits, misses, maxsize, currsize = f.cache_info()
    self.assertTrue(hits > misses)
    self.assertEqual(hits + misses, 1000)
    self.assertEqual(currsize, 20)

    f.cache_clear()  # test clearing
    hits, misses, maxsize, currsize = f.cache_info()
    self.assertEqual(hits, 0)
    self.assertEqual(misses, 0)
    self.assertEqual(currsize, 0)
    f(x, y)
    hits, misses, maxsize, currsize = f.cache_info()
    self.assertEqual(hits, 0)
    self.assertEqual(misses, 1)
    self.assertEqual(currsize, 1)

    # Test bypassing the cache
    self.assertIs(f.__wrapped__, orig)
    f.__wrapped__(x, y)
    hits, misses, maxsize, currsize = f.cache_info()
    self.assertEqual(hits, 0)
    self.assertEqual(misses, 1)
    self.assertEqual(currsize, 1)

    # test size zero (which means "never-cache")
    @functools.lru_cache(0)
    def f():
        f_cnt[0] += 1
        return 20
    self.assertEqual(f.cache_info().maxsize, 0)
    f_cnt = [0]
    for i in range(5):
        self.assertEqual(f(), 20)
    self.assertEqual(f_cnt[0], 5)
    hits, misses, maxsize, currsize = f.cache_info()
    self.assertEqual(hits, 0)
    self.assertEqual(misses, 5)
    self.assertEqual(currsize, 0)

    # test size one
    @functools.lru_cache(1)
    def f():
        f_cnt[0] += 1
        return 20
    self.assertEqual(f.cache_info().maxsize, 1)
    f_cnt = [0]
    for i in range(5):
        self.assertEqual(f(), 20)
    self.assertEqual(f_cnt[0], 1)
    hits, misses, maxsize, currsize = f.cache_info()
    self.assertEqual(hits, 4)
    self.assertEqual(misses, 1)
    self.assertEqual(currsize, 1)

    # test size two
    @functools.lru_cache(2)
    def f(x):
        f_cnt[0] += 1
        return x * 10
    self.assertEqual(f.cache_info().maxsize, 2)
    f_cnt = [0]
    # 4 misses: the first 7, the first 9, the first 8, and the final 7
    for x in 7, 9, 7, 9, 7, 9, 8, 8, 8, 9, 9, 9, 8, 8, 8, 7:
        self.assertEqual(f(x), x * 10)
    self.assertEqual(f_cnt[0], 4)
    hits, misses, maxsize, currsize = f.cache_info()
    self.assertEqual(hits, 12)
    self.assertEqual(misses, 4)
    self.assertEqual(currsize, 2)
import os
import tempfile

import pandas as pd
from nltk.stem import WordNetLemmatizer

import utils


def readTaggedData():
    data = pd.read_csv("nyt-ingredients-snapshot-2015.csv", encoding="utf-8")
    data = data.fillna(method="ffill")
    data.tail(10)


"""functions below are used only for crf tagging"""
lemmatizer = WordNetLemmatizer()
from functools32 import lru_cache
cached_lemmatize_fn = lru_cache(maxsize=250000)(lemmatizer.lemmatize)


# this function is used for tagging ingredients
def parse_ingredientForCRF(ingredients):
    returnArr = []
    eachIngre = []
    _, tmpFile = tempfile.mkstemp()
    _, tmpFile2 = tempfile.mkstemp()
    with open(tmpFile, 'w') as outfile:
        outfile.write(utils.export_data(ingredients))
    tmpFilePath = "./tmp/model_file"
    modelFilename = os.path.join(os.path.dirname(__file__), tmpFilePath)
    aa = "crf_test -m %s %s " % (modelFilename, tmpFile)
# -*- coding: utf-8 -*-
from __future__ import division as _division

import sys as _sys

import numpy as _np

if _sys.version_info.major < 3:
    import functools32 as _functools
else:
    import functools as _functools

# unit in degrees of latitude and longitude for each mesh level.
_unit_lat_lv1 = _functools.lru_cache(1)(lambda: 2/3)
_unit_lon_lv1 = _functools.lru_cache(1)(lambda: 1)
_unit_lat_40000 = _functools.lru_cache(1)(lambda: _unit_lat_lv1()/2)
_unit_lon_40000 = _functools.lru_cache(1)(lambda: _unit_lon_lv1()/2)
_unit_lat_20000 = _functools.lru_cache(1)(lambda: _unit_lat_40000()/2)
_unit_lon_20000 = _functools.lru_cache(1)(lambda: _unit_lon_40000()/2)
_unit_lat_16000 = _functools.lru_cache(1)(lambda: _unit_lat_lv1()/5)
_unit_lon_16000 = _functools.lru_cache(1)(lambda: _unit_lon_lv1()/5)
_unit_lat_lv2 = _functools.lru_cache(1)(lambda: _unit_lat_lv1()/8)
_unit_lon_lv2 = _functools.lru_cache(1)(lambda: _unit_lon_lv1()/8)
_unit_lat_8000 = _functools.lru_cache(1)(lambda: _unit_lat_lv1()/10)
_unit_lon_8000 = _functools.lru_cache(1)(lambda: _unit_lon_lv1()/10)
_unit_lat_5000 = _functools.lru_cache(1)(lambda: _unit_lat_lv2()/2)
_unit_lon_5000 = _functools.lru_cache(1)(lambda: _unit_lon_lv2()/2)
_unit_lat_4000 = _functools.lru_cache(1)(lambda: _unit_lat_8000()/2)
_unit_lon_4000 = _functools.lru_cache(1)(lambda: _unit_lon_8000()/2)
_unit_lat_2500 = _functools.lru_cache(1)(lambda: _unit_lat_5000()/2)
_unit_lon_2500 = _functools.lru_cache(1)(lambda: _unit_lon_5000()/2)
_unit_lat_2000 = _functools.lru_cache(1)(lambda: _unit_lat_lv2()/5)
_unit_lon_2000 = _functools.lru_cache(1)(lambda: _unit_lon_lv2()/5)
""" def deco(func): @functools.wraps(func) def wrapper(*args, **kwargs): argmap = inspect.getcallargs(func, *args, **kwargs) for k, map_func in six.iteritems(maps): if k in argmap: argmap[k] = map_func(argmap[k]) return func(**argmap) return wrapper return deco memoized = functools.lru_cache(maxsize=None) """ Alias to :func:`functools.lru_cache` """ _MEMOIZED_NOARGS = {} def memoized_ignoreargs(func): """ A decorator. It performs memoization ignoring the arguments used to call the function. """ hash(func) # make sure it is hashable. TODO is it necessary? def wrapper(*args, **kwargs): if func not in _MEMOIZED_NOARGS: res = func(*args, **kwargs)
def astar_search(problem, h=None):
    # Cache heuristic evaluations per node. The decorator returned by
    # lru_cache() accepts exactly one function, so no extra argument is passed.
    h = lru_cache()(h or problem.h_sld)
    return best_first_graph_search(problem,
                                   lambda node: h(node) + node.path_cost)
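Caching `h` per node only works if nodes are hashable and compare equal when they should share a cached value. A minimal sketch of that prerequisite; this `Node` is illustrative, not the library's own class:

class Node(object):
    # Hashed by state so repeated h(node) evaluations hit the cache.
    def __init__(self, state, path_cost=0):
        self.state = state
        self.path_cost = path_cost

    def __hash__(self):
        return hash(self.state)

    def __eq__(self, other):
        return isinstance(other, Node) and self.state == other.state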
def __enter__(self):
    self._wrapper_fn = functools32.lru_cache(maxsize=self._maxsize)(self._fn)
    self._overrideModuleFunctionWith(self._wrapper_fn)
    return self
import sys as _sys

import numpy as _np

if _sys.version_info.major < 3:
    import functools32 as _functools
else:
    import functools as _functools


def _get_num_digits(t):
    return _np.floor(_np.log10(t) + 1)


def _slice(t, start, stop):
    num_digits = _get_num_digits(t)
    return (t % 10**(num_digits - start)) // 10**(num_digits - stop)


# unit in degrees of latitude and longitude for each mesh level.
_unit_lat_lv1 = _functools.lru_cache(1)(lambda: 2 / 3)
_unit_lon_lv1 = _functools.lru_cache(1)(lambda: 1)
_unit_lat_40000 = _functools.lru_cache(1)(lambda: _unit_lat_lv1() / 2)
_unit_lon_40000 = _functools.lru_cache(1)(lambda: _unit_lon_lv1() / 2)
_unit_lat_20000 = _functools.lru_cache(1)(lambda: _unit_lat_40000() / 2)
_unit_lon_20000 = _functools.lru_cache(1)(lambda: _unit_lon_40000() / 2)
_unit_lat_16000 = _functools.lru_cache(1)(lambda: _unit_lat_lv1() / 5)
_unit_lon_16000 = _functools.lru_cache(1)(lambda: _unit_lon_lv1() / 5)
_unit_lat_lv2 = _functools.lru_cache(1)(lambda: _unit_lat_lv1() / 8)
_unit_lon_lv2 = _functools.lru_cache(1)(lambda: _unit_lon_lv1() / 8)
_unit_lat_8000 = _functools.lru_cache(1)(lambda: _unit_lat_lv1() / 10)
_unit_lon_8000 = _functools.lru_cache(1)(lambda: _unit_lon_lv1() / 10)
_unit_lat_5000 = _functools.lru_cache(1)(lambda: _unit_lat_lv2() / 2)
_unit_lon_5000 = _functools.lru_cache(1)(lambda: _unit_lon_lv2() / 2)
_unit_lat_4000 = _functools.lru_cache(1)(lambda: _unit_lat_8000() / 2)
_unit_lon_4000 = _functools.lru_cache(1)(lambda: _unit_lon_8000() / 2)
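The `lru_cache(1)(lambda: ...)` idiom turns each zero-argument lambda into a lazily evaluated, compute-once constant, so later units can be defined in terms of earlier ones without paying any computation at import time. The same idiom in isolation; the level-3 unit shown here is an illustrative addition, not taken from the snippet:

import functools

# Lazy cached "constant": the body runs on the first call only.
_unit_lat_lv3 = functools.lru_cache(1)(lambda: _unit_lat_lv2() / 10)

_unit_lat_lv3()  # computed and cached
_unit_lat_lv3()  # cache hit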
    # if it's an extended match, return the unextended version
    elif len(key) == 3 and key[0:2] == u'%!':
        return key, key[-1]
    # otherwise, remove it
    else:
        return key, u''


DEFAULT_TO_WRITE = dict(imap(auto_convert, FLEXIBLE_REGEX.keys()))
DEFAULT_TO_WRITE.update(HIDE_CHOICES)

# thread-safe caching
CACHE_MAX_SIZE = 100
_CACHE_LOCK = _thread_allocate_lock()
_CACHED_REGEXP = lru_cache(maxsize=CACHE_MAX_SIZE)(_to_regexp)


def to_regexp(fmt, substitutions=None):
    with _CACHE_LOCK:
        return _CACHED_REGEXP(fmt, substitutions)


# the main logic to construct a date/time from the matched data, lifted
# verbatim from the python source. the only changes are to check that
# a group has actually matched (since now some may be optional), the
# modified handling for y50, and using -ve indices for z minutes.
def to_time_tuple(found_dict):
    u'''Closely based on _strptime in standard Python.'''
    year = None
    month = day = 1
        def wrapper(*args, **kwargs):
            if six.PY2:
                argmap = inspect.getcallargs(func, *args, **kwargs)
            else:
                # getcallargs has been deprecated since 3.5
                sig = inspect.signature(func)
                argmap = sig.bind_partial(*args, **kwargs).arguments
            for k, map_func in six.iteritems(maps):
                if k in argmap:
                    argmap[k] = map_func(argmap[k])
            return func(**argmap)
        return wrapper
    return deco


memoized = functools.lru_cache(maxsize=None)
""" Alias to :func:`functools.lru_cache` """


def graph_memoized(func):
    """
    Like memoized, but keep one cache per default graph.
    """
    import tensorflow as tf
    GRAPH_ARG_NAME = '__IMPOSSIBLE_NAME_FOR_YOU__'

    @memoized
    def func_with_graph_arg(*args, **kwargs):
        kwargs.pop(GRAPH_ARG_NAME)
        return func(*args, **kwargs)