def main():
    global TM
    global LABELMAP
    global CTMAP
    global GENDER_TO_PRONOUN
    global TOKEN_TO_GENDER
    cfg = CONFIG[args.config]
    catpeople = DbfilenameShelf(args.in_shelf, protocol=-1, flag='r')
    TM = catpeople['__TOKEN_MAPPER__']
    TM.finalize()
    LABELMAP = util_catpeople.get_labelmap()
    CTMAP = util_catpeople.get_coarse_tagmap()
    GENDER_TO_PRONOUN = get_gender_to_pronoun(TM)
    TOKEN_TO_GENDER = get_token_to_gender(TM)
    if args.print_to_conll:
        # Print CatPeople in CoNLL format, splitting the URL list
        # across n_jobs parallel workers.
        partial_print_to_conll = functools.partial(
            print_to_conll, catpeople=catpeople)
        n_jobs = 4
        Parallel(n_jobs=n_jobs)(
            delayed(partial_print_to_conll)(out_fn=out_fn, urls=urls)
            for (out_fn, urls) in itertools.izip(
                (args.out_fn + str(i) for i in range(n_jobs)),
                split(catpeople['__URL_LIST__'], n_jobs)))
        return
    else:
        name = cfg._name
        if name.startswith(UNIGRAM):
            # doc_to_unigrams
            # --> entity_list_to_ngram_csr_mat(n=0, width=None)
            # --> get_ngrams_from_catpeople_entity
            # --> yield_ngrams
            # --> catpeople_sentence_iterator
            return doc_to_unigrams(cfg, catpeople)
        elif name.startswith(BIGRAM):
            # doc_to_bigrams
            # --> entity_list_to_ngram_csr_mat(n=0, width=None)
            # --> get_width_for_bigrams
            # --> entity_list_to_ngram_csr_mat(n=1, width=width)
            return doc_to_bigrams(cfg, catpeople)
        elif name.startswith(UNIVEC):
            # doc_to_univec
            # --> save_vec_file
            # --> entity_list_to_ngram_csr_mat(n=0, width=None)
            return doc_to_univec(cfg, catpeople)
        elif name.startswith(BIVEC):
            return doc_to_bivec(cfg)
        elif name.startswith(DSCTOK) or name.startswith(DSCSUF):
            # doc_to_dscfeat
            # --> entity_list_to_dscfeat_csr_mat
            # --> get_dscfeat_from_catpeople_entity
            # --> catpeople_sentence_iterator
            # --> yield_dsctok
            return doc_to_dscfeat(cfg, catpeople)
        elif name.startswith(DSCTOKVEC):
            return doc_to_dsctokvec(cfg)
        elif name.startswith(UNISUF):
            return doc_to_unisuf(cfg, catpeople)
        else:
            raise NotImplementedError(name)
def setup():
    ''' Load the catpeople data. '''
    url_mention = DbfilenameShelf(args.in_shelf, protocol=-1, flag='r')
    TM = url_mention['__TOKEN_MAPPER__']
    TM.finalize(catpeople_baseline_nb_config.MAX_TOK)
    E = url_mention['__URL_LIST__']
    DF = url_mention['__DF__']
    cat_folds = pkl.load(open(args.fold_fn))
    cat2url = util_catpeople.load_cat2url(args.cat2url_fn)
    performance_aggregator = Performance_Aggregator(args=args)
    return (url_mention, TM, E, cat_folds, cat2url,
            performance_aggregator, DF)
def __init__(self, datacfg, ppcfg, expcfg):
    # Init Part 0
    self.datacfg = datacfg
    self.ppcfg = ppcfg
    self.expcfg = expcfg
    with rasengan.tictoc('Init Part 1 : The Datacfg'):
        self.cp = DbfilenameShelf(
            r'%s/%s' % (uc.get_pfx(), self.datacfg.cp_fn),
            protocol=-1,
            flag='r')
        self.url_list = self.cp['__URL_LIST__']
        self.TM = self.cp['__TOKEN_MAPPER__']
        # self.TM.final must be patched to work with older
        # versions of TokenMapper that are in the pickle.
        if not hasattr(self.TM, 'final'):
            self.TM.final = False
        if self.is_malignull():
            self.TM([self.expcfg.NULL_KEY])
        self.bos_idx = self.TM.finalize()
        self.pa = Aggregator(
            datacfg=datacfg,
            ppcfg=ppcfg,
            expcfg=expcfg,
            url_list=self.url_list,
            TM=self.TM)
        self.cat_folds = pkl.load(uc.proj_open(self.datacfg.fold_fn))
        self.cat2url = uc.load_cat2url(uc.proj_open(self.datacfg.cat2url_fn))
        self.url_to_idx = dict((b, a) for a, b in enumerate(self.url_list))
        self.scratch = {}
    with rasengan.tictoc('Init Part 2 : The PP CFG'):
        print 'Reading', 'catpeople_pp_%d' % args.ppcfg
        self.smat = io.mmread(uc.proj_open('catpeople_pp_%d' % args.ppcfg))
        assert scipy.sparse.isspmatrix_coo(self.smat)
        if self.pp_prefix_is([UNIVEC, BIVEC, MALIGNER, DSCTOKVEC]):
            self.vectors = np.load(
                uc.proj_open('catpeople_pp_%d.vec' % args.ppcfg))
        if self.is_malignull():
            self.NULL_VEC = np.zeros((1, self.vectors.shape[1]))
        if self.exp_prefix_is([NBKERNEL, KERMACH, MALIGNER]):
            assert self.pp_prefix_is([UNIVEC, BIVEC, DSCTOKVEC])
        if self.expcfg.rm_fn_word:
            # Internally manipulates smat.
            self.remove_fn_word()
        if self.expcfg.weight_method.endswith('/df'):
            self.populate_idf()
    return
def update_shelf():
    # Open the shelf read-write (default flag) so '__DF__' can be added.
    url_mention = DbfilenameShelf(args.in_shelf, protocol=-1)
    TM = url_mention['__TOKEN_MAPPER__']
    TM.finalize(catpeople_baseline_nb_config.MAX_TOK)
    E = url_mention['__URL_LIST__']
    n_doc = 10000
    with rasengan.tictoc('Extracting Contexts'):
        df_obj = TextualClueObject(E[:n_doc], url_mention, TM)
        df = defaultdict(int)
        for features in df_obj.features.itervalues():
            for f in features:
                df[f] += 1
        for f in df.keys():
            # Convert raw counts to the fraction of documents containing f.
            df[f] = df[f] / float(n_doc)
        url_mention['__DF__'] = dict(df)
    url_mention.close()
    return
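# A toy illustration of the document-frequency computation above, run on
# two fake feature sets that stand in for df_obj.features. All names and
# values here are illustrative, not part of the original script.
from collections import defaultdict

features_by_doc = {'doc1': set(['f1', 'f2']), 'doc2': set(['f2'])}
n_doc = len(features_by_doc)

df = defaultdict(int)
for features in features_by_doc.values():
    for f in features:
        df[f] += 1
for f in df.keys():
    # Fraction of documents that contain feature f.
    df[f] = df[f] / float(n_doc)

print(dict(df))  # {'f1': 0.5, 'f2': 1.0} (key order may vary)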
def setUpClass(cls):
    super(TestEntityDescriptors, cls).setUpClass()
    global TM
    global LABELMAP
    global CTMAP
    cls.cpfn = (util_catpeople.get_pfx()
                + '/catpeople_clean_segmented_context.shelf')
    cls.parsefn = (util_catpeople.get_pfx()
                   + '/catpeople.parse.pkl')
    cls.catpeople = DbfilenameShelf(cls.cpfn, protocol=-1, flag='r')
    TM = cls.catpeople['__TOKEN_MAPPER__']
    TM.finalize()
    LABELMAP = util_catpeople.get_labelmap()
    CTMAP = util_catpeople.get_coarse_tagmap()
    # Inject global variables into the module's namespace.
    catpeople_preprocessor.TM = TM
    catpeople_preprocessor.LABELMAP = LABELMAP
    catpeople_preprocessor.CTMAP = CTMAP
    catpeople_preprocessor.GENDER_TO_PRONOUN = (
        catpeople_preprocessor.get_gender_to_pronoun(TM))
    catpeople_preprocessor.TOKEN_TO_GENDER = (
        catpeople_preprocessor.get_token_to_gender(TM))
    catpeople_preprocessor.populate_dsctok_globals()
    cls.testid = 1
    print 'Calling setup'
import argparse
import os
import re
import sys
from shelve import DbfilenameShelf

arg_parser = argparse.ArgumentParser(
    description='Remove junk from catpeople wikimic')
arg_parser.add_argument('--seed', default=0, type=int, help='Default={0}')
arg_parser.add_argument('--MAX_CHAR_IN_SENT', default=1000, type=int)
PDIR = ('/export/b15/prastog3' if os.uname()[1] == 'b15' else 'data/')
arg_parser.add_argument(
    '--in_shelf',
    default='%s/catpeople_wikilink_mentions.shelf' % PDIR, type=str)
arg_parser.add_argument(
    '--out_shelf',
    default='%s/catpeople_clean_segmented_context.shelf' % PDIR, type=str)
args = arg_parser.parse_args()
in_shelf = DbfilenameShelf(args.in_shelf, protocol=-1, flag='r')
out_shelf = DbfilenameShelf(args.out_shelf, protocol=-1)
urls = in_shelf['__URL_LIST__']
PAT_TOKENIZER = get_tokenizer()
TOKEN_MAPPER = TokenMapper()
MAX_CHAR_IN_SENT = args.MAX_CHAR_IN_SENT
MIDDLE_NAME_REGEX = re.compile(r'[A-Z][^ ]*? [A-Z]\. [A-Z]')
for url_idx, url in enumerate(urls):
    print >> sys.stderr, ('Done: %.3f \r' % (float(url_idx) * 100 / len(urls))),
    mentions = in_shelf[url]
    out_mentions = []
    for mention in mentions:
result = """ <section> <p>You are not logged in.</p> <p> <a href="login.py">Login</a> | <a href="accounts/register.py">Register</a> </p> </section>""" try: cookie = SimpleCookie() http_cookie_header = environ.get('HTTP_COOKIE') if http_cookie_header: cookie.load(http_cookie_header) if 'sid' in cookie: sid = cookie['sid'].value session_store = DbfilenameShelf('sessions/sess_' + sid, writeback=False) if session_store.get('authenticated'): message = '' form_data = FieldStorage() username = session_store.get('username') form = """<p> Hey, %s. Sorry to see you go. </p> <p> <strong>Warning! This action is permenant.</strong> All of your scores will be lost. </p> <form action="delete_account.py" method="post"> <label for="pass1">Enter password: </label> <input type="password" name="pass1" id="pass1" placeholder="Enter password" required /> <label for="pass2">Reenter password: </label> <input type="password" name="pass2" id="pass2" placeholder="Reenter password" required />
import os
from pathlib import Path
import shelve
from shelve import Shelf, DbfilenameShelf

data = {'a': 0, 'b': 1, 'c': 'c-string'}
filename = str(Path.home() / "shelf")
# os.remove(filename + ".db")

db = DbfilenameShelf(filename, flag='c', protocol=3, writeback=True)
# db.update(data)
# db.sync()
print(f"shelf: {dict(db)}")
db.close()  # Flush the writeback cache and release the file.
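# A minimal sketch of reading the shelf created above back in read-only
# mode; flag='r' assumes the database file already exists on disk.
from pathlib import Path
from shelve import DbfilenameShelf

filename = str(Path.home() / "shelf")
db = DbfilenameShelf(filename, flag='r', protocol=3)
try:
    for key, value in db.items():
        print(key, value)
finally:
    db.close()  # Always release the underlying database file.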
if len(form_data) != 0:
    try:
        cookie = SimpleCookie()
        http_cookie_header = environ.get('HTTP_COOKIE')
        if not http_cookie_header:
            sid = sha256(repr(time()).encode()).hexdigest()
            cookie['reset'] = sid
        else:
            cookie.load(http_cookie_header)
            if 'reset' not in cookie:
                sid = sha256(repr(time()).encode()).hexdigest()
                cookie['reset'] = sid
            else:
                sid = cookie['reset'].value
        session_store = DbfilenameShelf('../sessions/reset_' + sid,
                                        writeback=True)
        if session_store.get('code'):
            code = escape(form_data.getfirst('code', '').strip())
            if code:
                form = """<form action="forgot.py" method="post">
    <label for="code">Code: </label>
    <input type="number" name="code" id="code"
           min="0" max="99999" value="%s" required />
    <label for="pass1">Enter new password: </label>
    <input type="password" name="pass1" id="pass1" required />
    <label for="pass2">Reenter password: </label>
    <input type="password" name="pass2" id="pass2" required />
    <input type="submit" />
</form>""" % code
                if session_store.get('code') == code:
                    pass1 = escape(form_data.getfirst('pass1', '').strip())
                    pass2 = escape(form_data.getfirst('pass2', '').strip())
import argparse
import gzip
import cPickle as pkl
from shelve import DbfilenameShelf
from rasengan import groupby
# These helpers are assumed to live in util_catpeople, as in the
# neighboring snippets.
from util_catpeople import (get_pfx, proj_open, get_labelmap,
                            get_coarse_tagmap, get_fine_tagmap)

PFX = get_pfx()
arg_parser = argparse.ArgumentParser(description='')
arg_parser.add_argument(
    '--in_shelf',
    default=PFX + '/catpeople_clean_segmented_context.shelf', type=str)
arg_parser.add_argument(
    '--parsefn', default=PFX + '/catpeople.parse.gz', type=str)
arg_parser.add_argument(
    '--parse_pkl', default=PFX + '/catpeople.parse.pkl', type=str)
args = arg_parser.parse_args()
catpeople = DbfilenameShelf(args.in_shelf, protocol=-1, flag='r')
TM = catpeople['__TOKEN_MAPPER__']
labelmap = get_labelmap()
ctmap = get_coarse_tagmap()
ftmap = get_fine_tagmap()
f = gzip.GzipFile(fileobj=proj_open(args.parsefn))


def get(e):
    # Pick the token, head index, dependency label, coarse tag, and
    # fine tag columns out of a tab-separated CoNLL row.
    e = e.split('\t')
    return [e[1], int(e[6]), e[7], e[3], e[4]]


PARSES = {}
for parse in groupby(f):
    token, parent, labels, ctags, ftags = zip(*[get(r) for r in parse])
from contextlib import closing
from shelve import DbfilenameShelf


def _get_shelf_data(path):
    # Open the shelf read-only and copy its contents into a plain dict;
    # closing() guarantees the file handle is released.
    with closing(DbfilenameShelf(path, flag='r')) as shelf:
        return dict(shelf)
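# Hypothetical usage of the helper above: snapshot an existing shelf
# into a plain dict. The path is illustrative.
data = _get_shelf_data('sessions/sess_example')
print(sorted(data))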
    out_val.default_factory = None  # FINALIZE out_val
    return out_val


with rasengan.tictoc('Reading wikilinks'):
    # With joblib this takes only 8 minutes !!
    from joblib import Parallel, delayed
    out_val_list = Parallel(n_jobs=10)(
        delayed(get_mention_from_wikilink_thrift_file)(fn)
        for fn in range(1, args.last_f2r))
    # out_val_list = [get_mention_from_wikilink_thrift_file(fn)
    #                 for fn in range(1, args.last_f2r)]

with rasengan.tictoc('Shelving'):
    import shelve
    from shelve import DbfilenameShelf
    total_data = defaultdict(list)
    for out_val in out_val_list:
        for url in out_val:
            total_data[url].extend(out_val[url])
    total_data.default_factory = None  # FINALIZE total_data
    # Save the results of the processing.
    shelf = DbfilenameShelf(args.out_fn, protocol=-1)
    shelf['__URL_LIST__'] = total_data.keys()
    for url in shelf['__URL_LIST__']:
        shelf[url] = total_data[url]
    shelf.close()

# Validation
for e in POOL:
    assert e in total_data
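# A minimal sketch of reading such a shelf back, following the
# '__URL_LIST__' index convention used throughout these snippets;
# 'catpeople.shelf' is an illustrative filename.
from shelve import DbfilenameShelf

shelf = DbfilenameShelf('catpeople.shelf', protocol=-1, flag='r')
try:
    for url in shelf['__URL_LIST__']:
        mentions = shelf[url]  # the list of mentions stored per URL
finally:
    shelf.close()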