from functools import lru_cache

_lemmatizer = None


def lemmatizer():
    # Lazily create the lemmatizer: the NLTK import and the first call are slow.
    global _lemmatizer
    if not _lemmatizer:
        from nltk.stem import WordNetLemmatizer
        _lemmatizer = lru_cache(maxsize=50000)(WordNetLemmatizer().lemmatize)
    return _lemmatizer
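A minimal usage sketch (assuming the NLTK WordNet data is installed); repeated calls with the same arguments are served from the LRU cache rather than from WordNet:

lemmatize = lemmatizer()
print(lemmatize('corpora'))        # -> 'corpus'
print(lemmatize('running', 'v'))   # -> 'run'
print(lemmatize.cache_info())      # hit/miss counters of the underlying lru_cache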
Example #2
	def __init__(self, load_doc_vec=True):
		SetupClassifier.__init__(self, load_doc_vec)
		self.lemmatize = lru_cache(maxsize=50000)(WordNetLemmatizer().lemmatize)
		self.exclude = set(stopwords.words('english'))
		logging.info('classifier ready')
		self.article = None
		self.doc_vec_norms = numpy.linalg.norm(numpy.asarray(self.doc_vec, dtype=numpy.float32), axis=1)
	def __init__(self, file_name):
		self.speeches = open(file_name, 'r').read()
		## Lemmatizer with cache
		self.lemmatize = lru_cache(maxsize=50000)(WordNetLemmatizer().lemmatize)
		self.exclude = set(stopwords.words('english'))
		self.not_processed = 0
		logger.info("Instance created.")
 def __init__(self, file_name):
     self.speeches = open(file_name, 'r').read()
     ## Lemmatizer with cache
     self.lemmatize = (lru_cache(maxsize=50000))(
         WordNetLemmatizer().lemmatize)
     self.exclude = set(stopwords.words('english'))
     self.not_processed = 0
     logger.info("Instance created.")
Example #5
	def __init__(self):
		## Lemmatizer with cache
		self.lemmatize = lru_cache(maxsize=50000)(WordNetLemmatizer().lemmatize)
		self.exclude = set(stopwords.words('english'))
		self.tag_count = eval(open('count_tags.txt', 'r').read())
		self.tagset = set(self.tag_count.keys())
		self.conn = pymongo.Connection().articles.collection_1
		self.excluded_articles = 0
		logger.info("Instance created.")
Example #6
	def __init__(self, num_topics=4):
		self.dictionary = gensim.corpora.Dictionary()
		self.lemmatize = lru_cache(maxsize=50000)(WordNetLemmatizer().lemmatize)
		self.exclude = set(stopwords.words('english'))
		self.num_topics = num_topics
		self.HTMLTextBlobber = usefulText.HTMLTextBlob()
		self.formatted_text = []
		self.corpus = []
		self.url_counter = 0
		self.redis = redis.StrictRedis(host='localhost', port=6379, db=5)
		logging.info('Tracker initiated...')
		self.log = open('log.txt', 'a')
Example #7
 def __init__(self, num_topics=4):
     self.dictionary = gensim.corpora.Dictionary()
     self.lemmatize = (lru_cache(maxsize=50000))(
         WordNetLemmatizer().lemmatize)
     self.exclude = set(stopwords.words('english'))
     self.num_topics = num_topics
     self.HTMLTextBlobber = usefulText.HTMLTextBlob()
     self.formatted_text = []
     self.corpus = []
     self.url_counter = 0
     self.redis = redis.StrictRedis(host='localhost', port=6379, db=5)
     logging.info('Tracker initiated...')
     self.log = open('log.txt', 'a')
Example #8
 def __init__(self):
     self.lemmatize = functools32.lru_cache(maxsize=1000000)(nltk.stem.WordNetLemmatizer().lemmatize)
     text = open('stopwords.txt').read().split(',')
     self.stopwords = [word.strip() for word in text] # proper stopwords
def memoize(func):
    """Cache the value returned by a function call."""
    func = functools32.lru_cache()(func)
    _memoized_functions.append(func)
    return func
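The `_memoized_functions` list presumably exists so that every cache can be flushed from one place; a hedged sketch of such a registry (the `clear_memoized_values` helper below is an assumption, not taken from the original module):

_memoized_functions = []   # module-level registry the decorator appends to


def clear_memoized_values():
    # Hypothetical helper: flush every cache registered by memoize().
    for cached in _memoized_functions:
        cached.cache_clear()


@memoize
def square(x):
    return x * x


square(3)                 # miss, computed
square(3)                 # hit, served from the cache
clear_memoized_values()   # e.g. between test cases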
Example #10
# -*- coding: utf-8 -*-
"""
GitHub related domain models.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from functools32 import lru_cache

from ..lib.edx_repo_tools_data.utils import get_people as _get_people
from ..lib.exceptions import NotFoundError
from ..lib.github.models import GithubWebHookEvent

get_people = lru_cache()(_get_people)


class GithubEvent(GithubWebHookEvent):
    """
    A GitHub webhook event.

    Attributes:
        gh (github3.GitHub): An authenticated GitHub API client session
        event_type (str): GitHub event type
        event (Dict[str, Any]): The parsed event payload
    """
    def __init__(self, gh, event_type, event):
        """
        Init.

        Arguments:
Example #11
 def wrapper(fn):
     return functools32.lru_cache(maxsize=maxsize)(fn)
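This `wrapper` is evidently the inner function of a decorator factory; a plausible reconstruction of the enclosing function (the name `cached` is assumed, not taken from the source):

import functools32


def cached(maxsize=128):
    """Parameterised caching decorator built on functools32.lru_cache."""
    def wrapper(fn):
        return functools32.lru_cache(maxsize=maxsize)(fn)
    return wrapper


@cached(maxsize=256)
def fib(n):
    return n if n < 2 else fib(n - 1) + fib(n - 2)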
Example #12
import os

import nltk
#from snowballstemmer import EnglishStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from functools32 import lru_cache

import numpy as np

path = os.path.dirname(os.path.abspath(__file__))

max_terms = 11500

#stemmer = EnglishStemmer()
lmtzr = WordNetLemmatizer()
#stemWords = lru_cache(maxsize=max_terms)(stemmer.stemWords)
lemmatize = lru_cache(maxsize=max_terms)(lmtzr.lemmatize)

# ========================================================================= #
"""
read all text documents in enron and save as pandas table.
columns of table are [file], [text], and [label].
"""
def save_enron(path='/home/cilsat/dev/nlp', form='dataframe'):
    enron_data = {}
    enron_label = {}

    e_dirs = [e for e in os.listdir(path) if e.startswith('enron')]
    for dirs in e_dirs:
        data, labels = build_token_dict(os.path.join(path, dirs), getlabels=True)
        enron_data.update(data)
        enron_label.update(labels)
Example #13
from flask import request, jsonify, make_response, Response
from flask import current_app
from appbase.flaskutils import add_cors_headers, jsonify_unsafe
from appbase.errors import BaseError, AccessDenied, NotFoundError
from appbase.errors import InvalidSessionError
import appbase.users.sessions as sessionlib
import appbase.context as context

if settings.DB_TRANSACTIONS_ENABLED:
    from appbase.pw import dbtransaction
else:
    dbtransaction = lambda f: f

SESSION_COOKIE_NAME = getattr(settings, 'SESSION_COOKIE_NAME', '__s')

cache = lru_cache()
cache_ttl = datetime.timedelta(0, (10 * 60))


def extract_kw(request):
    return (request.args and dict((k, v) for (k, v) in request.args.items())) or \
            request.json or \
            (request.data and json.loads(request.data.decode('utf-8'))) or \
            request.form or \
            {}


def flaskapi(app, f, jsonify_result=True):
    @wraps(f)
    def wrapper(*args, **kw):
        logger = current_app.logger
Example #14

# Input arguments
PROGRAM_DESCRIPTION = "Read tweets from collection"
parser = argparse.ArgumentParser(description=PROGRAM_DESCRIPTION)
parser.add_argument('collection_name', type=str, help='collection_to_read_tweets')
parser.add_argument('directory', type=str, help='directory to store')
parser.add_argument('unique_user_file', type=str, help='path to unique user list in csv')
parser.add_argument('prefix', type=str, help='used in output file name eg hashtag')

args = vars(parser.parse_args())

# initializing lemmatizer
stemmer = SnowballStemmer("english")
wordnet_lemmatizer = WordNetLemmatizer()
lemmatize = lru_cache(maxsize=50000)(wordnet_lemmatizer.lemmatize)


def main():
    collection_name = args['collection_name']
    dir = args['directory']
    unique_user_file = args['unique_user_file']
    DIRECTORY = args['prefix']

    try:
        os.stat(dir)
    except:
        os.mkdir(dir)

    log_file = dir + '/' + DIRECTORY + "_log.log"
    logging.basicConfig(filename=log_file, level=logging.DEBUG, format='%(asctime)s %(message)s')
Example #15
# coding=utf-8
import codecs
import re
import cv2
import os

from utils import create_xmlfile_online
from functools32 import lru_cache

memorized = lru_cache(maxsize=10000)


class PatternMatcher(object):
    def __init__(self, pattern, _type):
        self.pattern = re.compile(pattern)
        self.type = _type

    def matchedType(self, pattern_str):
        return self.type if self.pattern.match(pattern_str) else None


class PatternGroupMatcher(object):
    def __init__(self, *args):
        for arg in args:
            assert isinstance(arg, PatternMatcher), arg
        self.patterns = args

    def find_type(self, pattern_str):
        for p in self.patterns:
            t = p.matchedType(pattern_str)
            if t:
 def __init__(self, func):
     self.func = lru_cache(maxsize=32)(func)
     self._res = None
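Only the constructor is shown; one plausible completion of such a wrapper (the class name `CachedCall`, the `__call__` method, and the `result` property are assumptions) delegates to the cached function and remembers the latest value:

from functools import lru_cache


class CachedCall(object):
    def __init__(self, func):
        self.func = lru_cache(maxsize=32)(func)
        self._res = None

    def __call__(self, *args):
        # Delegate to the cached function and keep the latest result around.
        self._res = self.func(*args)
        return self._res

    @property
    def result(self):
        return self._res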
Example #17
def memoize(func):
    """Cache the value returned by a function call."""
    func = functools32.lru_cache()(func)
    _memoized_functions.append(func)
    return func
Example #18
    def test_lru(self):
        def orig(x, y):
            return 3 * x + y

        f = functools.lru_cache(maxsize=20)(orig)
        hits, misses, maxsize, currsize = f.cache_info()
        self.assertEqual(maxsize, 20)
        self.assertEqual(currsize, 0)
        self.assertEqual(hits, 0)
        self.assertEqual(misses, 0)

        domain = range(5)
        for i in range(1000):
            x, y = choice(domain), choice(domain)
            actual = f(x, y)
            expected = orig(x, y)
            self.assertEqual(actual, expected)
        hits, misses, maxsize, currsize = f.cache_info()
        self.assertTrue(hits > misses)
        self.assertEqual(hits + misses, 1000)
        self.assertEqual(currsize, 20)

        f.cache_clear()  # test clearing
        hits, misses, maxsize, currsize = f.cache_info()
        self.assertEqual(hits, 0)
        self.assertEqual(misses, 0)
        self.assertEqual(currsize, 0)
        f(x, y)
        hits, misses, maxsize, currsize = f.cache_info()
        self.assertEqual(hits, 0)
        self.assertEqual(misses, 1)
        self.assertEqual(currsize, 1)

        # Test bypassing the cache
        self.assertIs(f.__wrapped__, orig)
        f.__wrapped__(x, y)
        hits, misses, maxsize, currsize = f.cache_info()
        self.assertEqual(hits, 0)
        self.assertEqual(misses, 1)
        self.assertEqual(currsize, 1)

        # test size zero (which means "never-cache")
        @functools.lru_cache(0)
        def f():
            f_cnt[0] += 1
            return 20

        self.assertEqual(f.cache_info().maxsize, 0)
        f_cnt = [0]
        for i in range(5):
            self.assertEqual(f(), 20)
        self.assertEqual(f_cnt[0], 5)
        hits, misses, maxsize, currsize = f.cache_info()
        self.assertEqual(hits, 0)
        self.assertEqual(misses, 5)
        self.assertEqual(currsize, 0)

        # test size one
        @functools.lru_cache(1)
        def f():
            f_cnt[0] += 1
            return 20

        self.assertEqual(f.cache_info().maxsize, 1)
        f_cnt = [0]
        for i in range(5):
            self.assertEqual(f(), 20)
        self.assertEqual(f_cnt[0], 1)
        hits, misses, maxsize, currsize = f.cache_info()
        self.assertEqual(hits, 4)
        self.assertEqual(misses, 1)
        self.assertEqual(currsize, 1)

        # test size two
        @functools.lru_cache(2)
        def f(x):
            f_cnt[0] += 1
            return x * 10

        self.assertEqual(f.cache_info().maxsize, 2)
        f_cnt = [0]
        for x in 7, 9, 7, 9, 7, 9, 8, 8, 8, 9, 9, 9, 8, 8, 8, 7:
            #    *  *              *                          *
            self.assertEqual(f(x), x * 10)
        self.assertEqual(f_cnt[0], 4)
        hits, misses, maxsize, currsize = f.cache_info()
        self.assertEqual(hits, 12)
        self.assertEqual(misses, 4)
        self.assertEqual(currsize, 2)
import os
import tempfile

from nltk.stem.wordnet import WordNetLemmatizer

import utils

import pandas as pd


def readTaggedData():
    data = pd.read_csv("nyt-ingredients-snapshot-2015.csv", encoding="utf-8")
    data = data.fillna(method="ffill")
    data.tail(10)


"""functions below  are used for only crf tagging """

lemmatizer = WordNetLemmatizer()
from functools32 import lru_cache
cached_lemmatize_fn = lru_cache(maxsize=250000)(lemmatizer.lemmatize)

# this function is used for tagging ingredients


def parse_ingredientForCRF(ingredients):
    returnArr = []
    eachIngre = []
    _, tmpFile = tempfile.mkstemp()
    _, tmpFile2 = tempfile.mkstemp()
    with open(tmpFile, 'w') as outfile:
        outfile.write(utils.export_data(ingredients))

    tmpFilePath = "./tmp/model_file"
    modelFilename = os.path.join(os.path.dirname(__file__), tmpFilePath)
    aa = "crf_test   -m %s %s " % (modelFilename, tmpFile)
Example #20
# -*- coding: utf-8 -*-

from __future__ import division as _division
import sys as _sys
import numpy as _np
if _sys.version_info.major < 3:
    import functools32 as _functools
else:
    import functools as _functools

# unit in degree of latitude and longitude for each mesh level. 
_unit_lat_lv1 = _functools.lru_cache(1)(lambda: 2/3)
_unit_lon_lv1 = _functools.lru_cache(1)(lambda: 1)
_unit_lat_40000 = _functools.lru_cache(1)(lambda: _unit_lat_lv1()/2)
_unit_lon_40000 = _functools.lru_cache(1)(lambda: _unit_lon_lv1()/2)
_unit_lat_20000 = _functools.lru_cache(1)(lambda: _unit_lat_40000()/2)
_unit_lon_20000 = _functools.lru_cache(1)(lambda: _unit_lon_40000()/2)
_unit_lat_16000 = _functools.lru_cache(1)(lambda: _unit_lat_lv1()/5)
_unit_lon_16000 = _functools.lru_cache(1)(lambda: _unit_lon_lv1()/5)
_unit_lat_lv2 = _functools.lru_cache(1)(lambda: _unit_lat_lv1()/8)
_unit_lon_lv2 = _functools.lru_cache(1)(lambda: _unit_lon_lv1()/8)
_unit_lat_8000 = _functools.lru_cache(1)(lambda: _unit_lat_lv1()/10)
_unit_lon_8000 = _functools.lru_cache(1)(lambda: _unit_lon_lv1()/10)
_unit_lat_5000 = _functools.lru_cache(1)(lambda: _unit_lat_lv2()/2)
_unit_lon_5000 = _functools.lru_cache(1)(lambda: _unit_lon_lv2()/2)
_unit_lat_4000 = _functools.lru_cache(1)(lambda: _unit_lat_8000()/2)
_unit_lon_4000 = _functools.lru_cache(1)(lambda: _unit_lon_8000()/2)
_unit_lat_2500 = _functools.lru_cache(1)(lambda: _unit_lat_5000()/2)
_unit_lon_2500 = _functools.lru_cache(1)(lambda: _unit_lon_5000()/2)
_unit_lat_2000 = _functools.lru_cache(1)(lambda: _unit_lat_lv2()/5)
_unit_lon_2000 = _functools.lru_cache(1)(lambda: _unit_lon_lv2()/5)
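As a quick sanity check of the chained definitions above (assuming the module is imported as-is): a level-2 mesh is 1/8 of a level-1 mesh, so the derived spans work out as follows.

assert abs(_unit_lat_lv1() - 2.0 / 3) < 1e-12
assert abs(_unit_lat_lv2() - 2.0 / 3 / 8) < 1e-12    # 1/12 degree of latitude
assert abs(_unit_lon_5000() - 1.0 / 16) < 1e-12      # 1 / 8 / 2 degree of longitude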
Example #21
    """
    def deco(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            argmap = inspect.getcallargs(func, *args, **kwargs)
            for k, map_func in six.iteritems(maps):
                if k in argmap:
                    argmap[k] = map_func(argmap[k])
            return func(**argmap)

        return wrapper

    return deco


memoized = functools.lru_cache(maxsize=None)
""" Alias to :func:`functools.lru_cache` """

_MEMOIZED_NOARGS = {}


def memoized_ignoreargs(func):
    """
    A decorator. It performs memoization ignoring the arguments used to call
    the function.
    """
    hash(func)  # make sure it is hashable. TODO is it necessary?

    def wrapper(*args, **kwargs):
        if func not in _MEMOIZED_NOARGS:
            res = func(*args, **kwargs)
Example #22
def astar_search(problem, h=None):
    # lru_cache's wrapper takes only the function to memoise; cache h by node.
    h = lru_cache()(h or problem.h_sld)
    return best_first_graph_search(problem, lambda node: h(node) + node.path_cost)
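Note that `lru_cache` keys on its arguments, so the nodes passed to `h` must be hashable (and should compare equal when they represent the same state); a minimal illustration with a stand-in node class, not the original one:

from functools import lru_cache


class Node(object):
    def __init__(self, state, path_cost=0):
        self.state = state
        self.path_cost = path_cost

    # Hash/compare on the state so cached heuristic values are shared
    # between nodes that reach the same state.
    def __hash__(self):
        return hash(self.state)

    def __eq__(self, other):
        return isinstance(other, Node) and self.state == other.state


h = lru_cache(maxsize=None)(lambda node: len(node.state))
h(Node('abc'))
h(Node('abc'))           # cache hit: same state hashes and compares equal
print(h.cache_info())    # hits=1, misses=1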
Example #23
 def __enter__(self):
     self._wrapper_fn = functools32.lru_cache(maxsize=self._maxsize)(
         self._fn)
     self._overrideModuleFunctionWith(self._wrapper_fn)
     return self
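Only `__enter__` is shown; a hedged sketch of what the full patch-with-cache context manager could look like (everything outside `__enter__`'s body is an assumption), restoring the original function on exit:

import functools32


class CachedModuleFunction(object):
    """Temporarily replace a module-level function with an lru_cache'd wrapper."""

    def __init__(self, module, fn_name, maxsize=128):
        self._module = module
        self._fn_name = fn_name
        self._fn = getattr(module, fn_name)
        self._maxsize = maxsize

    def _overrideModuleFunctionWith(self, fn):
        setattr(self._module, self._fn_name, fn)

    def __enter__(self):
        self._wrapper_fn = functools32.lru_cache(maxsize=self._maxsize)(self._fn)
        self._overrideModuleFunctionWith(self._wrapper_fn)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Put the original, uncached function back on the module.
        self._overrideModuleFunctionWith(self._fn)
        return False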
Example #24
from __future__ import division as _division

import sys as _sys
import numpy as _np

if _sys.version_info.major < 3:
    import functools32 as _functools
else:
    import functools as _functools


def _get_num_digits(t):
    return _np.floor(_np.log10(t) + 1)


def _slice(t, start, stop):
    num_digits = _get_num_digits(t)
    return (t % 10**(num_digits - start)) // 10**(num_digits - stop)


# unit in degree of latitude and longitude for each mesh level.
_unit_lat_lv1 = _functools.lru_cache(1)(lambda: 2 / 3)
_unit_lon_lv1 = _functools.lru_cache(1)(lambda: 1)
_unit_lat_40000 = _functools.lru_cache(1)(lambda: _unit_lat_lv1() / 2)
_unit_lon_40000 = _functools.lru_cache(1)(lambda: _unit_lon_lv1() / 2)
_unit_lat_20000 = _functools.lru_cache(1)(lambda: _unit_lat_40000() / 2)
_unit_lon_20000 = _functools.lru_cache(1)(lambda: _unit_lon_40000() / 2)
_unit_lat_16000 = _functools.lru_cache(1)(lambda: _unit_lat_lv1() / 5)
_unit_lon_16000 = _functools.lru_cache(1)(lambda: _unit_lon_lv1() / 5)
_unit_lat_lv2 = _functools.lru_cache(1)(lambda: _unit_lat_lv1() / 8)
_unit_lon_lv2 = _functools.lru_cache(1)(lambda: _unit_lon_lv1() / 8)
_unit_lat_8000 = _functools.lru_cache(1)(lambda: _unit_lat_lv1() / 10)
_unit_lon_8000 = _functools.lru_cache(1)(lambda: _unit_lon_lv1() / 10)
_unit_lat_5000 = _functools.lru_cache(1)(lambda: _unit_lat_lv2() / 2)
_unit_lon_5000 = _functools.lru_cache(1)(lambda: _unit_lon_lv2() / 2)
_unit_lat_4000 = _functools.lru_cache(1)(lambda: _unit_lat_8000() / 2)
_unit_lon_4000 = _functools.lru_cache(1)(lambda: _unit_lon_8000() / 2)
Example #25
    # if it's an extended match, return the unextended version
    elif len(key) == 3 and key[0:2] == u'%!':
        return key, key[-1]
    # otherwise, remove it
    else:
        return key, u''

DEFAULT_TO_WRITE = dict(imap(auto_convert, FLEXIBLE_REGEX.keys()))
DEFAULT_TO_WRITE.update(HIDE_CHOICES)


# thread-safe caching

CACHE_MAX_SIZE = 100
_CACHE_LOCK = _thread_allocate_lock()
_CACHED_REGEXP = lru_cache(maxsize=CACHE_MAX_SIZE)(_to_regexp)

def to_regexp(fmt, substitutions=None):
    with _CACHE_LOCK:
        return _CACHED_REGEXP(fmt, substitutions)
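The same lock-around-the-cache pattern can be shown in isolation; a small sketch with a stand-in compile function (names here are illustrative, not from the original module):

import re
import threading
from functools import lru_cache


def _compile_pattern(fmt):
    return re.compile(fmt)


_PATTERN_LOCK = threading.Lock()
_cached_compile = lru_cache(maxsize=100)(_compile_pattern)


def compile_pattern(fmt):
    # Serialise calls so only one thread touches the cache at a time.
    with _PATTERN_LOCK:
        return _cached_compile(fmt)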


# the main logic to construct a date/time from the matched data, lifted
# verbatim from the python source.  the only changes are to check that
# a group has actually matched (since now some may be optional), the
# modified handling for y50, and using negative indices for z minutes.

def to_time_tuple(found_dict):
    u'''Closely based on _strptime in standard Python.'''
    year = None
    month = day = 1
Example #26
        def wrapper(*args, **kwargs):
            if six.PY2:
                argmap = inspect.getcallargs(func, *args, **kwargs)
            else:
                # getcallargs has been deprecated since Python 3.5
                sig = inspect.signature(func)
                argmap = sig.bind_partial(*args, **kwargs).arguments
            for k, map_func in six.iteritems(maps):
                if k in argmap:
                    argmap[k] = map_func(argmap[k])
            return func(**argmap)
        return wrapper
    return deco


memoized = functools.lru_cache(maxsize=None)
""" Alias to :func:`functools.lru_cache` """


def graph_memoized(func):
    """
    Like memoized, but keep one cache per default graph.
    """
    import tensorflow as tf
    GRAPH_ARG_NAME = '__IMPOSSIBLE_NAME_FOR_YOU__'

    @memoized
    def func_with_graph_arg(*args, **kwargs):
        kwargs.pop(GRAPH_ARG_NAME)
        return func(*args, **kwargs)
    def test_lru(self):
        def orig(x, y):
            return 3*x+y
        f = functools.lru_cache(maxsize=20)(orig)
        hits, misses, maxsize, currsize = f.cache_info()
        self.assertEqual(maxsize, 20)
        self.assertEqual(currsize, 0)
        self.assertEqual(hits, 0)
        self.assertEqual(misses, 0)

        domain = range(5)
        for i in range(1000):
            x, y = choice(domain), choice(domain)
            actual = f(x, y)
            expected = orig(x, y)
            self.assertEqual(actual, expected)
        hits, misses, maxsize, currsize = f.cache_info()
        self.assertTrue(hits > misses)
        self.assertEqual(hits + misses, 1000)
        self.assertEqual(currsize, 20)

        f.cache_clear()   # test clearing
        hits, misses, maxsize, currsize = f.cache_info()
        self.assertEqual(hits, 0)
        self.assertEqual(misses, 0)
        self.assertEqual(currsize, 0)
        f(x, y)
        hits, misses, maxsize, currsize = f.cache_info()
        self.assertEqual(hits, 0)
        self.assertEqual(misses, 1)
        self.assertEqual(currsize, 1)

        # Test bypassing the cache
        self.assertIs(f.__wrapped__, orig)
        f.__wrapped__(x, y)
        hits, misses, maxsize, currsize = f.cache_info()
        self.assertEqual(hits, 0)
        self.assertEqual(misses, 1)
        self.assertEqual(currsize, 1)

        # test size zero (which means "never-cache")
        @functools.lru_cache(0)
        def f():
            f_cnt[0] += 1
            return 20
        self.assertEqual(f.cache_info().maxsize, 0)
        f_cnt = [0]
        for i in range(5):
            self.assertEqual(f(), 20)
        self.assertEqual(f_cnt[0], 5)
        hits, misses, maxsize, currsize = f.cache_info()
        self.assertEqual(hits, 0)
        self.assertEqual(misses, 5)
        self.assertEqual(currsize, 0)

        # test size one
        @functools.lru_cache(1)
        def f():
            f_cnt[0] += 1
            return 20
        self.assertEqual(f.cache_info().maxsize, 1)
        f_cnt = [0]
        for i in range(5):
            self.assertEqual(f(), 20)
        self.assertEqual(f_cnt[0], 1)
        hits, misses, maxsize, currsize = f.cache_info()
        self.assertEqual(hits, 4)
        self.assertEqual(misses, 1)
        self.assertEqual(currsize, 1)

        # test size two
        @functools.lru_cache(2)
        def f(x):
            f_cnt[0] += 1
            return x*10
        self.assertEqual(f.cache_info().maxsize, 2)
        f_cnt = [0]
        for x in 7, 9, 7, 9, 7, 9, 8, 8, 8, 9, 9, 9, 8, 8, 8, 7:
            #    *  *              *                          *
            self.assertEqual(f(x), x*10)
        self.assertEqual(f_cnt[0], 4)
        hits, misses, maxsize, currsize = f.cache_info()
        self.assertEqual(hits, 12)
        self.assertEqual(misses, 4)
        self.assertEqual(currsize, 2)