Beispiel #1
0
def main():
    """Entry point: load the catpeople shelf, publish the module globals,
    then either dump CoNLL shards or dispatch on the configured featurizer.
    """
    global TM
    global LABELMAP
    global CTMAP
    global GENDER_TO_PRONOUN
    global TOKEN_TO_GENDER
    cfg = CONFIG[args.config]
    catpeople = DbfilenameShelf(args.in_shelf, protocol=-1, flag='r')
    # Globals consumed by the feature-extraction helpers below.
    TM = catpeople['__TOKEN_MAPPER__']
    TM.finalize()
    LABELMAP = util_catpeople.get_labelmap()
    CTMAP = util_catpeople.get_coarse_tagmap()
    GENDER_TO_PRONOUN = get_gender_to_pronoun(TM)
    TOKEN_TO_GENDER = get_token_to_gender(TM)
    if args.print_to_conll:
        # Shard the url list over workers and print each shard in CoNLL format.
        n_jobs = 4
        out_names = (args.out_fn + str(i) for i in range(n_jobs))
        url_shards = split(catpeople['__URL_LIST__'], n_jobs)
        printer = functools.partial(print_to_conll, catpeople=catpeople)
        Parallel(n_jobs=n_jobs)(
            delayed(printer)(out_fn=fn, urls=shard)
            for fn, shard in itertools.izip(out_names, url_shards))
        return
    # Dispatch on the config-name prefix. NOTE(review): the order of these
    # checks matters — DSCTOK/DSCSUF is tested before DSCTOKVEC, as in the
    # original chain; do not reorder.
    name = cfg._name
    if name.startswith(UNIGRAM):
        return doc_to_unigrams(cfg, catpeople)
    if name.startswith(BIGRAM):
        return doc_to_bigrams(cfg, catpeople)
    if name.startswith(UNIVEC):
        return doc_to_univec(cfg, catpeople)
    if name.startswith(BIVEC):
        return doc_to_bivec(cfg)
    if name.startswith((DSCTOK, DSCSUF)):
        return doc_to_dscfeat(cfg, catpeople)
    if name.startswith(DSCTOKVEC):
        return doc_to_dsctokvec(cfg)
    if name.startswith(UNISUF):
        return doc_to_unisuf(cfg, catpeople)
    raise NotImplementedError(name)
Beispiel #2
0
def setup():
    ''' Load the catpeople data: the shelf, its token mapper, url list,
    document frequencies, plus the folds/cat2url tables and a fresh
    performance aggregator.
    '''
    shelf = DbfilenameShelf(args.in_shelf, protocol=-1, flag='r')
    token_mapper = shelf['__TOKEN_MAPPER__']
    token_mapper.finalize(catpeople_baseline_nb_config.MAX_TOK)
    urls = shelf['__URL_LIST__']
    doc_freq = shelf['__DF__']
    folds = pkl.load(open(args.fold_fn))
    cat2url = util_catpeople.load_cat2url(args.cat2url_fn)
    aggregator = Performance_Aggregator(args=args)
    return (shelf, token_mapper, urls, folds, cat2url, aggregator, doc_freq)
Beispiel #3
0
    def __init__(self, datacfg, ppcfg, expcfg):
        """Wire up an experiment from three configs.

        datacfg -- locates the catpeople shelf, folds and cat2url files.
        ppcfg   -- selects which preprocessed feature matrix to load.
        expcfg  -- experiment knobs (NULL_KEY, rm_fn_word, weight_method, ...).
        """
        # Init Part 0
        self.datacfg = datacfg
        self.ppcfg = ppcfg
        self.expcfg = expcfg

        with rasengan.tictoc('Init Part 1 : The Datacfg'):
            # Read-only shelf holding the corpus plus bookkeeping keys.
            self.cp = DbfilenameShelf(
                r'%s/%s'%(uc.get_pfx(),self.datacfg.cp_fn),
                protocol=-1,
                flag='r')
            self.url_list = self.cp['__URL_LIST__']
            self.TM = self.cp['__TOKEN_MAPPER__']
            # self.TM.final must be patched to work with older
            # versions of TokenMapper that are in the pickle.
            if not hasattr(self.TM, 'final'):
                self.TM.final = False
            if self.is_malignull():
                # Register the NULL token before the mapper is frozen.
                self.TM([self.expcfg.NULL_KEY])
            self.bos_idx = self.TM.finalize()
            self.pa = Aggregator(
                datacfg=datacfg,
                ppcfg=ppcfg,
                expcfg=expcfg,
                url_list=self.url_list,
                TM=self.TM)
            self.cat_folds = pkl.load(uc.proj_open(self.datacfg.fold_fn))
            self.cat2url = uc.load_cat2url(uc.proj_open(self.datacfg.cat2url_fn))
            # Reverse index: url -> position in url_list.
            self.url_to_idx = dict((b,a) for a,b in enumerate(self.url_list))
            self.scratch = {}
            pass

        with rasengan.tictoc('Init Part 2 : The PP CFG'):
            print 'Reading', 'catpeople_pp_%d'%args.ppcfg
            # Sparse feature matrix produced by the preprocessing stage;
            # the assert pins it to COO format.
            self.smat = io.mmread(uc.proj_open('catpeople_pp_%d'%args.ppcfg))
            assert scipy.sparse.isspmatrix_coo(self.smat)
            if self.pp_prefix_is([UNIVEC, BIVEC, MALIGNER, DSCTOKVEC]):
                # Dense embeddings exist only for the vector-space configs.
                self.vectors = np.load(uc.proj_open('catpeople_pp_%d.vec'%args.ppcfg))
            pass

        if self.is_malignull():
            # Zero vector standing in for the NULL token's embedding.
            self.NULL_VEC = np.zeros((1,self.vectors.shape[1]))
        if self.exp_prefix_is([NBKERNEL, KERMACH, MALIGNER]):
            # Kernel-style experiments require a vector-space preprocessing.
            assert self.pp_prefix_is([UNIVEC, BIVEC, DSCTOKVEC])
        if self.expcfg.rm_fn_word:
            # Internally Manipulates smat
            self.remove_fn_word()
        if self.expcfg.weight_method.endswith('/df'):
            self.populate_idf()
        return
Beispiel #4
0
def update_shelf():
    """Compute per-feature document frequencies over the first 10k urls
    and store them back into the shelf under '__DF__'.
    """
    shelf = DbfilenameShelf(args.in_shelf, protocol=-1)
    token_mapper = shelf['__TOKEN_MAPPER__']
    token_mapper.finalize(catpeople_baseline_nb_config.MAX_TOK)
    urls = shelf['__URL_LIST__']
    n_doc = 10000
    with rasengan.tictoc('Extracting Contexts'):
        clue_obj = TextualClueObject(urls[:n_doc], shelf, token_mapper)
    # Count in how many documents each feature fires.
    counts = defaultdict(int)
    for feats in clue_obj.features.itervalues():
        for feat in feats:
            counts[feat] += 1
    # Normalize raw counts to fractions of the sampled documents.
    denom = float(n_doc)
    shelf['__DF__'] = dict((feat, cnt / denom) for feat, cnt in counts.items())
    shelf.close()
    return
Beispiel #5
0
 def setUpClass(cls):
     """Open the catpeople shelf once per test class and inject the
     module-level globals that catpeople_preprocessor expects.
     """
     super(TestEntityDescriptors, cls).setUpClass()
     global TM
     global LABELMAP
     global CTMAP
     cls.cpfn = (util_catpeople.get_pfx() + '/catpeople_clean_segmented_context.shelf')
     cls.parsefn = (util_catpeople.get_pfx() + '/catpeople.parse.pkl')
     # Read-only shelf shared by all tests in this class.
     cls.catpeople = DbfilenameShelf(cls.cpfn, protocol=-1, flag='r')
     TM = cls.catpeople['__TOKEN_MAPPER__']
     TM.finalize()
     LABELMAP = util_catpeople.get_labelmap()
     CTMAP = util_catpeople.get_coarse_tagmap()
     # Inject global variables to module's namespace.
     catpeople_preprocessor.TM = TM
     catpeople_preprocessor.LABELMAP = LABELMAP
     catpeople_preprocessor.CTMAP = CTMAP
     catpeople_preprocessor.GENDER_TO_PRONOUN = catpeople_preprocessor.get_gender_to_pronoun(TM)
     catpeople_preprocessor.TOKEN_TO_GENDER = catpeople_preprocessor.get_token_to_gender(TM)
     catpeople_preprocessor.populate_dsctok_globals()
     cls.testid = 1
     print 'Calling setup'
Beispiel #6
0
import argparse
import sys, os
# CLI for the wikimic cleaning pass: reads raw wikilink mentions, writes
# a cleaned / segmented shelf.
arg_parser = argparse.ArgumentParser(
    description='Remove junk from catpeople wikimic')
arg_parser.add_argument('--seed', default=0, type=int, help='Default={0}')
arg_parser.add_argument('--MAX_CHAR_IN_SENT', default=1000, type=int)
# Data root depends on the host we are running on (b15 has a local copy).
PDIR = ('/export/b15/prastog3' if os.uname()[1] == 'b15' else 'data/')
arg_parser.add_argument('--in_shelf',
                        default='%s/catpeople_wikilink_mentions.shelf' % PDIR,
                        type=str)
arg_parser.add_argument('--out_shelf',
                        default='%s/catpeople_clean_segmented_context.shelf' %
                        PDIR,
                        type=str)
args = arg_parser.parse_args()
# Input is opened read-only; output shelf is created/writable.
in_shelf = DbfilenameShelf(args.in_shelf, protocol=-1, flag='r')
out_shelf = DbfilenameShelf(args.out_shelf, protocol=-1)
urls = in_shelf['__URL_LIST__']

PAT_TOKENIZER = get_tokenizer()
TOKEN_MAPPER = TokenMapper()
MAX_CHAR_IN_SENT = args.MAX_CHAR_IN_SENT
import re
# Matches names with an abbreviated middle initial, e.g. "John Q. Public".
MIDDLE_NAME_REGEX = re.compile('[A-Z][^ ]*? [A-Z]\. [A-Z]')
for url_idx, url in enumerate(urls):
    # Progress meter on stderr (carriage return keeps it on one line).
    print >> sys.stderr, ('Done: %.3f \r' %
                          (float(url_idx) * 100 / len(urls))),
    mentions = in_shelf[url]

    out_mentions = []
    for mention in mentions:
Beispiel #7
0
result = """
   <section>
       <p>You are not logged in.</p>
       <p>
           <a href="login.py">Login</a> &vert; <a href="accounts/register.py">Register</a>
       </p>
   </section>"""

try:
    cookie = SimpleCookie()
    http_cookie_header = environ.get('HTTP_COOKIE')
    if http_cookie_header:
        cookie.load(http_cookie_header)
        if 'sid' in cookie:
            sid = cookie['sid'].value
            session_store = DbfilenameShelf('sessions/sess_' + sid,
                                            writeback=False)
            if session_store.get('authenticated'):
                message = ''
                form_data = FieldStorage()
                username = session_store.get('username')
                form = """<p>
                    Hey, %s. Sorry to see you go.
                </p>
                <p>
                    <strong>Warning! This action is permenant.</strong> All of your scores will be lost.
                </p>
                <form action="delete_account.py" method="post">
                    <label for="pass1">Enter password: </label>
                    <input type="password" name="pass1" id="pass1" placeholder="Enter password" required />
                    <label for="pass2">Reenter password: </label>
                    <input type="password" name="pass2" id="pass2" placeholder="Reenter password" required />
Beispiel #8
0
import os
from pathlib import Path
import shelve
from shelve import Shelf, DbfilenameShelf

# Sample payload for seeding the shelf (see the commented update() below).
data = {'a': 0, 'b': 1, 'c': 'c-string'}

# The shelf's backing file lives in the user's home directory.
filename = str(Path.home() / "shelf")
# os.remove(filename + ".db")  # uncomment to wipe the backing dbm file
db = DbfilenameShelf(filename, flag='c', protocol=3, writeback=True)
# db.update(data)  # seed the shelf once
# db.sync()        # flush the writeback cache to disk
print("shelf: {}".format(dict(db)))
Beispiel #9
0
if len(form_data) != 0:
    try:
        cookie = SimpleCookie()
        http_cookie_header = environ.get('HTTP_COOKIE')
        if not http_cookie_header:
            sid = sha256(repr(time()).encode()).hexdigest()
            cookie['reset'] = sid
        else:
            cookie.load(http_cookie_header)
            if 'reset' not in cookie:
                sid = sha256(repr(time()).encode()).hexdigest()
                cookie['reset'] = sid
            else:
                sid = cookie['reset'].value
        session_store = DbfilenameShelf('../sessions/reset_' + sid,
                                        writeback=True)
        if session_store.get('code'):
            code = escape(form_data.getfirst('code', '').strip())
            if code:
                form = """<form action="forgot.py" method="post">
                        <label for="code">Code: </label>
                        <input type="number" name="code" id="code" min="0" max="99999" value="%s" required />
                        <label for="pass1">Enter new password: </label>
                        <input type="password" name="pass1" id="pass1" required />
                        <label for="pass2">Reenter password: </label>
                        <input type="password" name="pass2" id="pass2" required />
                        <input type="submit" />
                    </form>""" % code
                if session_store.get('code') == code:
                    pass1 = escape(form_data.getfirst('pass1', '').strip())
                    pass2 = escape(form_data.getfirst('pass2', '').strip())
import cPickle as pkl
from rasengan import groupby
PFX = get_pfx()
# CLI: locate the cleaned shelf plus the (gzipped) dependency parses and
# the pickle we will write them to.
arg_parser = argparse.ArgumentParser(description='')
arg_parser.add_argument('--in_shelf',
                        default=PFX +
                        '/catpeople_clean_segmented_context.shelf',
                        type=str)
arg_parser.add_argument('--parsefn',
                        default=PFX + '/catpeople.parse.gz',
                        type=str)
arg_parser.add_argument('--parse_pkl',
                        default=PFX + '/catpeople.parse.pkl',
                        type=str)
args = arg_parser.parse_args()
catpeople = DbfilenameShelf(args.in_shelf, protocol=-1, flag='r')
TM = catpeople['__TOKEN_MAPPER__']
# Label / tag vocabularies used to intern the parse columns.
labelmap = get_labelmap()
ctmap = get_coarse_tagmap()
ftmap = get_fine_tagmap()
# Stream of CoNLL-style parse rows, decompressed on the fly.
f = gzip.GzipFile(fileobj=proj_open(args.parsefn))


def get(e):
    """Extract [token, head-index, deprel, coarse-tag, fine-tag] from one
    tab-separated CoNLL-style parse row.
    """
    _idx, token, _lemma, ctag, ftag, _feats, head, label = e.split('\t')[:8]
    return [token, int(head), label, ctag, ftag]


PARSES = {}
# groupby(f) yields one sentence's worth of parse rows at a time; zip(*)
# transposes the per-row fields into parallel column tuples.
for parse in groupby(f):
    token, parent, labels, ctags, ftags = zip(*[get(r) for r in parse])
Beispiel #11
0
def _get_shelf_data(path):
    with closing(DbfilenameShelf(path, flag='r')) as shelf:
        return dict(shelf)
    out_val.default_factory = None  # FINALIZE out_val
    return out_val


with rasengan.tictoc('Reading wikilinks'):
    # With joblib this takes only 8 minutes !!
    from joblib import Parallel, delayed
    # Fan out one job per wikilink thrift file (files are numbered
    # 1 .. last_f2r-1); each job returns a url -> mentions dict.
    out_val_list = Parallel(n_jobs=10)(
        delayed(get_mention_from_wikilink_thrift_file)(fn)
        for fn in range(1, args.last_f2r))
    # out_val_list = [get_mention_from_wikilink_thrift_file(fn)
    #                 for fn in range(1, args.last_f2r)]

with rasengan.tictoc('Shelving'):
    import shelve
    from shelve import DbfilenameShelf
    # Merge the per-worker dicts into one url -> mentions mapping.
    total_data = defaultdict(list)
    for worker_out in out_val_list:
        for url, mentions in worker_out.items():
            total_data[url].extend(mentions)
    total_data.default_factory = None  # freeze: unknown urls now raise
    # Persist the merged mapping, url list first.
    shelf = DbfilenameShelf(args.out_fn, protocol=-1)
    shelf['__URL_LIST__'] = total_data.keys()
    for url in shelf['__URL_LIST__']:
        shelf[url] = total_data[url]
    shelf.close()
    # Sanity check: every pooled entity must have been extracted.
    for e in POOL:
        assert e in total_data