Beispiel #1
0
def read():
    """Print the values stored under keys 'a' and 'b' of 'not_bsd.dat'.

    The database handle is closed even when one of the lookups raises.
    """
    db = DataBase('not_bsd.dat')
    try:
        a = db['a']
        b = db['b']
    finally:
        # Release the handle on the error path too.
        db.close()

    print('a', a)
    print('b', b)
Beispiel #2
0
    def __init__(self,
                 filename,
                 flag,
                 key_type='str',
                 dump_method='json',
                 cached=True,
                 writeback=False):
        """Open the shelf at *filename* and configure key/dump handling.

        Args:
            filename: Path handed to ``DbfilenameShelf.__init__``.
            flag: Open mode handed to ``DbfilenameShelf.__init__``.
            key_type: Forwarded to ``self._setup_methods``.
            dump_method: Forwarded to ``self._setup_methods``.
            cached: Requested caching; only honoured when ``flag == 'r'``.
            writeback: Forwarded to ``DbfilenameShelf.__init__``.
        """
        # Third positional argument (-1) is the pickle protocol.
        DbfilenameShelf.__init__(self, filename, flag, -1, writeback)
        # Compare by value: ``is`` against a str literal depends on string
        # interning and triggers a SyntaxWarning on Python 3.8+.
        cached = (flag == 'r') and cached
        self._setup_methods(cached, key_type, dump_method)
Beispiel #3
0
    def load_workers(self, filename):
        """
        Return a list of workers read from disk as [(id, started, assetid),...].

        An empty list is returned when the shelf has no readable 'workers'
        entry.  The shelf is always closed before returning.

        """
        shelf = DbfilenameShelf(filename)
        try:
            return shelf['workers']
        except Exception:
            # Still best effort, but unlike the former bare ``except:`` this
            # no longer swallows KeyboardInterrupt / SystemExit.
            return []
        finally:
            shelf.close()
Beispiel #4
0
    def dump_workers(self, filename, workers):
        """
        Write a sequence of workers to disk, e.g.
        [(id, started, assetid),...], and then return the sequence.

        Each worker is a mapping with 'worker', 'started' and 'args' entries;
        the second element of 'args' becomes the third field of the tuple.

        """
        seq = [(w['worker'], w['started'], w['args'][1]) for w in workers]
        shelf = DbfilenameShelf(filename)
        try:
            shelf['workers'] = seq
        finally:
            # Close the shelf even if pickling or writing fails.
            shelf.close()
        return seq
Beispiel #5
0
 def __init__(self, fname=None, tmpdir=None, persistent=False):
     """Open a shelf on *fname*, or on a freshly named temporary file.

     With *fname* given, the database is opened with flag 'w'.  Otherwise a
     unique name is reserved in *tmpdir* (``TMPDIR`` when omitted), a new
     database is created there with flag 'n', and the file is passed to
     ``register_tmp_file`` unless *persistent* is true.
     """
     if fname is not None:
         self._tmpdir = os.path.dirname(fname)
         self.filename = fname
         DbfilenameShelf.__init__(self, self.filename, flag='w', protocol=-1)
         return

     # No file requested: pick the directory, then reserve a unique name in it.
     self._tmpdir = tmpdir if tmpdir is not None else TMPDIR
     handle, self.filename = mkstemp('', PREFIX, dir=self._tmpdir)
     # Only the name is kept; the placeholder file itself is removed again.
     os.close(handle)
     os.unlink(self.filename)
     # Create the shelf under the reserved name.
     DbfilenameShelf.__init__(self, self.filename, flag='n', protocol=-1)
     if not persistent:
         register_tmp_file(self.filename)
def get_common_words(text_storage: shelve.DbfilenameShelf,
                     amount_of_common_words: int) -> [(str, int)]:
    """Return the most frequent valid, non-stop words over all stored pages.

    Every value of *text_storage* is split on whitespace, hyphens and
    en-dashes.  A token is counted (lower-cased) when, after punctuation is
    stripped, it consists only of letters and apostrophes, is accepted by the
    "en_US" enchant dictionary, and is not listed in ``stop_words.txt``.

    Returns:
        Up to *amount_of_common_words* ``(word, count)`` pairs, highest count
        first.
    """
    # Enchant setup adapted from their tutorial: https://pyenchant.github.io/pyenchant/tutorial.html
    spell_checker = enchant.Dict("en_US")

    # One stop word per line; trailing whitespace is dropped.
    with open("stop_words.txt", "r") as stop_word_file:
        stop_words = {line.rstrip() for line in stop_word_file}

    separators = re.compile(r"[\s\-–]")
    punctuation = re.compile(r"[.,?:!;()\[\]{}\"]")
    word_shape = re.compile(r"^[a-zA-Z']+$")

    counts = defaultdict(int)
    for page_text in text_storage.values():
        for raw_token in separators.split(page_text):
            token = punctuation.sub("", raw_token)
            # Cheap shape test first; it also keeps empty tokens away from
            # the dictionary lookup.
            if word_shape.match(token) is None:
                continue
            if not spell_checker.check(token):
                continue
            lowered = token.lower()
            if lowered in stop_words:
                continue
            counts[lowered] += 1

    # Stable sort by descending count, then keep the requested prefix.
    ranked = sorted(counts.items(), key=lambda item: -item[1])
    return ranked[:amount_of_common_words]
Beispiel #7
0
def loadfile(
    site: site.Site, src: ContentSrc, bin: bool, filecache: shelve.DbfilenameShelf
) -> List[Tuple[ContentSrc, Optional[bytes]]]:
    """Load the contents produced from *src*, reusing *filecache* when fresh.

    The cache entry is keyed by the source's package and path and stores the
    ``src.stat()`` value alongside the loaded bodies; it is reused only when
    the stored stat equals the current one.

    Args:
        site: Passed through to the selected loader.
        src: Source to load; must have ``srcpath`` set unless *bin* is true.
        bin: When true, always use ``binloader`` instead of picking a loader
            by file extension from ``FILELOADERS``.
        filecache: Mapping used to cache ``(stat, bodies)`` per source.

    Returns:
        One ``(contentsrc, body)`` pair per item yielded by the loader, where
        ``body`` is bytes (str bodies are UTF-8 encoded) or ``None`` for any
        other body type.
    """
    curstat = src.stat()

    key = f"{src.package}_::::_{src.srcpath}"

    stat, bodies = filecache.get(key, (None, None))
    if stat and stat == curstat:
        return cast(List[Tuple[ContentSrc, Optional[bytes]]], bodies)

    if bin:
        loader = binloader
    else:
        assert src.srcpath
        ext = os.path.splitext(src.srcpath)[1]
        loader = FILELOADERS.get(ext, binloader)

    ret: List[Tuple[ContentSrc, Optional[bytes]]] = []
    for contentsrc, body in loader(site, src):
        assert contentsrc.metadata["loader"]

        # Exactly one entry per loaded item.  The former ``if``/``if``/``else``
        # chain appended a bytes body twice: once as-is and once as None.
        encoded: Optional[bytes]
        if isinstance(body, bytes):
            encoded = body
        elif isinstance(body, str):
            encoded = body.encode("utf-8")
        else:
            encoded = None
        ret.append((contentsrc, encoded))

    filecache[key] = curstat, ret
    return ret
Beispiel #8
0
def main():
    """Initialise the module-level lookup tables and run the selected action.

    Opens the shelf named by ``args.in_shelf`` read-only, publishes the token
    mapper and the label/tag/gender maps as module globals, then either calls
    ``print_to_conll`` in parallel (when ``args.print_to_conll`` is set) or
    dispatches on the prefix of ``cfg._name`` to one of the ``doc_to_*``
    functions and returns its result.

    Raises:
        NotImplementedError: if ``cfg._name`` starts with none of the known
            prefixes.
    """
    global TM
    global LABELMAP
    global CTMAP
    global GENDER_TO_PRONOUN
    global TOKEN_TO_GENDER
    cfg = CONFIG[args.config]
    catpeople = DbfilenameShelf(args.in_shelf, protocol=-1, flag='r')
    TM = catpeople['__TOKEN_MAPPER__']
    TM.finalize()
    LABELMAP = util_catpeople.get_labelmap()
    CTMAP = util_catpeople.get_coarse_tagmap()
    GENDER_TO_PRONOUN = get_gender_to_pronoun(TM)
    TOKEN_TO_GENDER = get_token_to_gender(TM)
    if args.print_to_conll:
        # Print CatPeople in Conll Format
        partial_print_to_conll = functools.partial(print_to_conll,
                                                   catpeople=catpeople)
        # The URL list is split into n_jobs parts; part i is written to
        # args.out_fn + str(i).  itertools.izip exists only on Python 2.
        n_jobs = 4
        Parallel(n_jobs=n_jobs)(
            delayed(partial_print_to_conll)(out_fn=out_fn, urls=urls)
            for (out_fn,
                 urls) in itertools.izip((args.out_fn + str(i) for i in range(
                     n_jobs)), split(catpeople['__URL_LIST__'], n_jobs)))
        return
    else:
        # Prefix dispatch: the first matching branch wins, so order matters.
        name = cfg._name
        if name.startswith(UNIGRAM):
            return doc_to_unigrams(cfg, catpeople)
            # doc_to_unigrams
            # --> entity_list_to_ngram_csr_mat(n=0, width=None)
            #     --> get_ngrams_from_catpeople_entity
            #         --> yield_ngrams
            #         --> catpeople_sentence_iterator
        elif name.startswith(BIGRAM):
            return doc_to_bigrams(cfg, catpeople)
            # doc_to_unigrams
            # --> entity_list_to_ngram_csr_mat(n=0, width=None)
            # --> get_width_for_bigrams
            # --> entity_list_to_ngram_csr_mat(n=1, width=width)
        elif name.startswith(UNIVEC):
            return doc_to_univec(cfg, catpeople)
            # doc_to_univec
            # --> save_vec_file
            # --> entity_list_to_ngram_csr_mat(n=0, width=None)
        elif name.startswith(BIVEC):
            return doc_to_bivec(cfg)
        elif name.startswith(DSCTOK) or name.startswith(DSCSUF):
            return doc_to_dscfeat(cfg, catpeople)
            # --> entity_list_to_dscfeat_csr_mat
            #     --> get_dscfeat_from_catpeople_entity
            #         --> catpeople_sentence_iterator
            #         --> yield_dsctok
        # NOTE(review): if the DSCTOKVEC constant begins with the DSCTOK
        # constant, the branch above matches first and this one is
        # unreachable -- confirm the constant values.
        elif name.startswith(DSCTOKVEC):
            return doc_to_dsctokvec(cfg)
        elif name.startswith(UNISUF):
            return doc_to_unisuf(cfg, catpeople)
        else:
            raise NotImplementedError(name)
Beispiel #9
0
 def __init__(self, name, logger):
     """Open the per-session cache file and reset it when it is stale.

     The cache lives at ``$HOME/.lox/.<name>.cache``.  It is considered valid
     only when its stored 'local_dir' equals the configured local directory
     and its stored 'version' equals the version reported by the API;
     otherwise it is cleared and both values are written afresh.

     Args:
         name: Session name; selects the cache file, the API and the config.
         logger: Object whose ``warn`` method is called on re-initialisation.
     """
     filename = os.environ['HOME'] + '/.lox/.' + name + '.cache'
     DbfilenameShelf.__init__(self, filename, protocol=2, writeback=True)
     api = LoxApi(name)
     api_version = api.version()
     config_dir = config.settings[name]['local_dir']
     # Explicit comparison instead of assert/except AssertionError: asserts
     # are stripped under ``python -O``, which would silently disable the
     # staleness check.
     cache_is_valid = (config_dir == self.get('local_dir', None)
                       and api_version == self.get('version', None))
     if not cache_is_valid:
         # Cache is considered not safe, so re-initialized
         logger.warn("Initializing cache")
         self.clear()
         self[u'local_dir'] = config_dir
         self[u'version'] = api_version
Beispiel #10
0
def update_shelf():
    """Compute per-feature document frequencies and store them as '__DF__'.

    Features are extracted for the first ``n_doc`` URLs of the shelf named by
    ``args.in_shelf``; each feature's occurrence count is divided by
    ``n_doc`` and the resulting plain dict is written back to the shelf.
    """
    n_doc = 10000
    url_mention = DbfilenameShelf(args.in_shelf, protocol=-1)
    token_mapper = url_mention['__TOKEN_MAPPER__']
    token_mapper.finalize(catpeople_baseline_nb_config.MAX_TOK)
    url_list = url_mention['__URL_LIST__']
    with rasengan.tictoc('Extracting Contexts'):
        clue_obj = TextualClueObject(url_list[:n_doc], url_mention, token_mapper)

    # Count, per feature, the number of feature collections it appears in.
    counts = defaultdict(int)
    for doc_features in clue_obj.features.itervalues():
        for feature in doc_features:
            counts[feature] += 1

    # Normalise by the nominal document count.
    denominator = float(n_doc)
    url_mention['__DF__'] = dict(
        (feature, count / denominator) for feature, count in counts.items())
    url_mention.close()
    return
Beispiel #11
0
 def __init__(self, args_sequence: Sequence, as_input_cache=False):
     """Initialise the cache base class, then open the backing shelf.

     As an input cache the shelf is opened with flag 'r'; otherwise the flag
     comes from ``FILE_CACHE_MODE[self.mode]`` and writeback follows
     ``self.in_memory_cache``.  An IOError while opening is reported through
     ``self._critical`` with ``ExitCode.FILE_ERROR``.
     """
     Cache.__init__(self, args_sequence)
     try:
         # noinspection PyUnresolvedReferences
         shelf_options = {'filename': self.filename}
         if as_input_cache:
             shelf_options['flag'] = 'r'
         else:
             # noinspection PyUnresolvedReferences
             shelf_options['flag'] = self.FILE_CACHE_MODE[self.mode]
             shelf_options['writeback'] = self.in_memory_cache
         DbfilenameShelf.__init__(self, **shelf_options)
     except IOError as e:
         self._critical('Error while opening cache', ExitCode.FILE_ERROR, e)
     # noinspection PyUnresolvedReferences
     self._debug(f' Initialized', header=f'Cache "{self.filename}"')
Beispiel #12
0
def get_friends(cache: DbfilenameShelf = None, name: str = SHLOMO):
    """Return the friends of *name*, using *cache* when one is supplied.

    On a cache miss the first request yields the first page's singers, a
    searcher callable and the remaining page range; the remaining pages are
    fetched through a ``TPool`` of ``INNER_MAX_WORKERS`` workers and merged
    with the first page.  The result is stored in *cache* (and synced) when a
    cache is supplied.

    Args:
        cache: Optional mapping with ``get``, item assignment and ``sync``.
        name: Name to look up.
    """
    if cache is not None:
        cached_friends = cache.get(name)
        if cached_friends is not None:
            # Return the value just fetched rather than reading (and
            # unpickling) the same entry a second time.
            return cached_friends
    response = get_first_request(name)
    first_page_singers = response[0]
    parallel_searcher = response[1]
    pages_range = response[2]
    if not pages_range:  # only 1 page
        friends = first_page_singers
    else:
        with TPool(INNER_MAX_WORKERS) as pool:
            next_pages_singers = pool.map(parallel_searcher, pages_range)
        friends = first_page_singers.union(*next_pages_singers)
    if cache is not None:
        cache[name] = friends
        cache.sync()
    return friends
Beispiel #13
0
 def __init__(self, fname=None, tmpdir=None, persistent=False):
     """Open a shelf on *fname*, or create one on a new temporary file.

     Without *fname*, a unique file name is reserved in *tmpdir* (falling
     back to ``TMPDIR``), a new database is created there with flag 'n', and
     the file is handed to ``register_tmp_file`` unless *persistent* is true.
     With *fname*, that database is opened with flag 'w'.
     """
     use_temp_file = fname is None
     if use_temp_file:
         # Choose the directory and reserve a unique name inside it.
         self._tmpdir = TMPDIR if tmpdir is None else tmpdir
         descriptor, self.filename = mkstemp('', PREFIX, dir=self._tmpdir)
         # Only the name is kept; the placeholder file is removed again.
         os.close(descriptor)
         os.unlink(self.filename)
         open_flag = 'n'
     else:
         self._tmpdir = os.path.dirname(fname)
         self.filename = fname
         open_flag = 'w'
     DbfilenameShelf.__init__(self,
                              self.filename,
                              flag=open_flag,
                              protocol=-1)
     if use_temp_file and not persistent:
         register_tmp_file(self.filename)
Beispiel #14
0
def setup():
    ''' Load the catpeople data.

    Opens the shelf named by ``args.in_shelf`` read-only, finalizes its token
    mapper with ``catpeople_baseline_nb_config.MAX_TOK`` and loads the fold
    and category-to-URL files named in ``args``.

    Returns:
        The tuple ``(url_mention, TM, E, cat_folds, cat2url,
        performance_aggregator, DF)``.  The shelf is returned open.
    '''
    url_mention = DbfilenameShelf(args.in_shelf, protocol=-1, flag='r')
    TM = url_mention['__TOKEN_MAPPER__']
    TM.finalize(catpeople_baseline_nb_config.MAX_TOK)
    E = url_mention['__URL_LIST__']
    DF = url_mention['__DF__']
    # Close the fold file deterministically instead of leaking the handle.
    with open(args.fold_fn) as fold_file:
        cat_folds = pkl.load(fold_file)
    cat2url = util_catpeople.load_cat2url(args.cat2url_fn)
    performance_aggregator = Performance_Aggregator(args=args)
    return (url_mention, TM, E, cat_folds, cat2url, performance_aggregator, DF)
Beispiel #15
0
    def __init__(self, datacfg, ppcfg, expcfg):
        """Load the data shelf, the preprocessed matrix and experiment options.

        Args:
            datacfg: Data configuration; ``cp_fn``, ``fold_fn`` and
                ``cat2url_fn`` are read from it.
            ppcfg: Preprocessing configuration; stored and passed to
                ``Aggregator``.
            expcfg: Experiment configuration; ``NULL_KEY``, ``rm_fn_word``
                and ``weight_method`` are read from it.
        """
        # Init Part 0
        self.datacfg = datacfg
        self.ppcfg = ppcfg
        self.expcfg = expcfg

        with rasengan.tictoc('Init Part 1 : The Datacfg'):
            # Read-only shelf located under the project prefix.
            self.cp = DbfilenameShelf(
                r'%s/%s'%(uc.get_pfx(),self.datacfg.cp_fn),
                protocol=-1,
                flag='r')
            self.url_list = self.cp['__URL_LIST__']
            self.TM = self.cp['__TOKEN_MAPPER__']
            # self.TM.final must be patched to work with older
            # versions of TokenMapper that are in the pickle.
            if not hasattr(self.TM, 'final'):
                self.TM.final = False
            # The NULL key is passed to the mapper before it is finalized.
            if self.is_malignull():
                self.TM([self.expcfg.NULL_KEY])
            self.bos_idx = self.TM.finalize()
            self.pa = Aggregator(
                datacfg=datacfg,
                ppcfg=ppcfg,
                expcfg=expcfg,
                url_list=self.url_list,
                TM=self.TM)
            self.cat_folds = pkl.load(uc.proj_open(self.datacfg.fold_fn))
            self.cat2url = uc.load_cat2url(uc.proj_open(self.datacfg.cat2url_fn))
            # Reverse lookup: URL -> position in url_list.
            self.url_to_idx = dict((b,a) for a,b in enumerate(self.url_list))
            self.scratch = {}
            pass

        with rasengan.tictoc('Init Part 2 : The PP CFG'):
            # NOTE(review): the file name is built from the module-level
            # ``args.ppcfg``, not from the ``ppcfg`` argument -- confirm the
            # two always refer to the same configuration.
            print 'Reading', 'catpeople_pp_%d'%args.ppcfg
            self.smat = io.mmread(uc.proj_open('catpeople_pp_%d'%args.ppcfg))
            assert scipy.sparse.isspmatrix_coo(self.smat)
            # self.vectors is only set for these preprocessing prefixes.
            if self.pp_prefix_is([UNIVEC, BIVEC, MALIGNER, DSCTOKVEC]):
                self.vectors = np.load(uc.proj_open('catpeople_pp_%d.vec'%args.ppcfg))
            pass

        # NOTE(review): relies on self.vectors having been set above --
        # confirm is_malignull() implies one of the vector prefixes.
        if self.is_malignull():
            self.NULL_VEC = np.zeros((1,self.vectors.shape[1]))
        if self.exp_prefix_is([NBKERNEL, KERMACH, MALIGNER]):
            assert self.pp_prefix_is([UNIVEC, BIVEC, DSCTOKVEC])
        if self.expcfg.rm_fn_word:
            # Internally Manipulates smat
            self.remove_fn_word()
        if self.expcfg.weight_method.endswith('/df'):
            self.populate_idf()
        return
Beispiel #16
0
 def setUpClass(cls):
     """Open the catpeople shelf and inject shared globals for the tests.

     Sets ``cls.cpfn``, ``cls.parsefn``, ``cls.catpeople`` and ``cls.testid``,
     and assigns the token mapper and the label/tag/gender maps both to this
     module's globals and to the ``catpeople_preprocessor`` module.
     """
     super(TestEntityDescriptors, cls).setUpClass()
     global TM
     global LABELMAP
     global CTMAP
     # Both data files are located under the project prefix.
     cls.cpfn = (util_catpeople.get_pfx() + '/catpeople_clean_segmented_context.shelf')
     cls.parsefn = (util_catpeople.get_pfx() + '/catpeople.parse.pkl')
     cls.catpeople = DbfilenameShelf(cls.cpfn, protocol=-1, flag='r')
     TM = cls.catpeople['__TOKEN_MAPPER__']
     TM.finalize()
     LABELMAP = util_catpeople.get_labelmap()
     CTMAP = util_catpeople.get_coarse_tagmap()
     # Inject global variables to module's namespace.
     catpeople_preprocessor.TM = TM
     catpeople_preprocessor.LABELMAP = LABELMAP
     catpeople_preprocessor.CTMAP = CTMAP
     catpeople_preprocessor.GENDER_TO_PRONOUN = catpeople_preprocessor.get_gender_to_pronoun(TM)
     catpeople_preprocessor.TOKEN_TO_GENDER = catpeople_preprocessor.get_token_to_gender(TM)
     # Called only after the assignments above are in place.
     catpeople_preprocessor.populate_dsctok_globals()
     cls.testid = 1
     print 'Calling setup'
def get_longest_page(shelf: shelve.DbfilenameShelf) -> (str, int):
    """Return the key of the page with the most words and that word count.

    Page text is split on whitespace, hyphens and en-dashes; every resulting
    piece (including empty ones) counts as a word.  On a tie the page seen
    first wins.  An empty shelf yields ``("", 0)``.
    """
    separators = re.compile(r"[\s\-–]")
    best_url, best_count = "", 0

    for url in shelf.keys():
        word_count = len(separators.split(shelf[url]))
        # Strictly greater, so an earlier page keeps the lead on ties.
        if word_count > best_count:
            best_url, best_count = url, word_count

    return best_url, best_count
Beispiel #18
0
def _get_shelf_data(path):
    """Return the full contents of the shelf at *path* as a plain dict.

    The shelf is opened read-only and is closed before returning, also when
    reading fails.
    """
    shelf = DbfilenameShelf(path, flag='r')
    try:
        return dict(shelf)
    finally:
        shelf.close()
Beispiel #19
0
 def get(self,name,default=None):
     """Return the value stored under *name*, or *default* when it is absent.

     *name* is UTF-8 encoded before it is used as the shelf key.
     """
     encoded_name = name.encode('utf8')
     if not DbfilenameShelf.has_key(self, encoded_name):
         return default
     return DbfilenameShelf.__getitem__(self, encoded_name)
Beispiel #20
0
 def __setitem__(self, name, value):
     """Store *value* under the UTF-8 encoded form of *name*, then sync."""
     key = name.encode('utf8')
     DbfilenameShelf.__setitem__(self, key, value)
     # sync() is called after every single write.
     self.sync()
Beispiel #21
0
# Runs only when the request carried form fields.
if len(form_data) != 0:
    try:
        # Reuse the 'reset' session id from the request cookie, or create one.
        cookie = SimpleCookie()
        http_cookie_header = environ.get('HTTP_COOKIE')
        if not http_cookie_header:
            # NOTE(review): the id is a hash of the current time, which is
            # guessable; consider the `secrets` module for session ids.
            sid = sha256(repr(time()).encode()).hexdigest()
            cookie['reset'] = sid
        else:
            cookie.load(http_cookie_header)
            if 'reset' not in cookie:
                sid = sha256(repr(time()).encode()).hexdigest()
                cookie['reset'] = sid
            else:
                sid = cookie['reset'].value
        # NOTE(review): a client-supplied cookie value is concatenated into a
        # file path without validation -- confirm it cannot escape
        # ../sessions/.
        session_store = DbfilenameShelf('../sessions/reset_' + sid,
                                        writeback=True)
        # Only proceed when this session already has a reset code stored.
        if session_store.get('code'):
            code = escape(form_data.getfirst('code', '').strip())
            if code:
                # Re-render the form with the submitted code pre-filled.
                form = """<form action="forgot.py" method="post">
                        <label for="code">Code: </label>
                        <input type="number" name="code" id="code" min="0" max="99999" value="%s" required />
                        <label for="pass1">Enter new password: </label>
                        <input type="password" name="pass1" id="pass1" required />
                        <label for="pass2">Reenter password: </label>
                        <input type="password" name="pass2" id="pass2" required />
                        <input type="submit" />
                    </form>""" % code
                # The submitted code must equal the one stored in the session.
                if session_store.get('code') == code:
                    pass1 = escape(form_data.getfirst('pass1', '').strip())
                    pass2 = escape(form_data.getfirst('pass2', '').strip())
Beispiel #22
0
    def __init__(self, filename, flag, key_type='str', dump_method='json',
                 cached=True, writeback=False):
        """Open the shelf at *filename* and configure key/dump handling.

        Args:
            filename: Path handed to ``DbfilenameShelf.__init__``.
            flag: Open mode handed to ``DbfilenameShelf.__init__``.
            key_type: Forwarded to ``self._setup_methods``.
            dump_method: Forwarded to ``self._setup_methods``.
            cached: Requested caching; only honoured when ``flag == 'r'``.
            writeback: Forwarded to ``DbfilenameShelf.__init__``.
        """
        # Third positional argument (-1) is the pickle protocol.
        DbfilenameShelf.__init__(self, filename, flag, -1, writeback)
        # Compare by value: ``is`` against a str literal depends on string
        # interning and triggers a SyntaxWarning on Python 3.8+.
        cached = (flag == 'r') and cached
        self._setup_methods(cached, key_type, dump_method)
Beispiel #23
0
import os
from pathlib import Path
import shelve
from shelve import Shelf, DbfilenameShelf

# Sample payload for the (currently disabled) update call below.
data = {'a': 0, 'b': 1, 'c': 'c-string'}

# The shelf is kept in the user's home directory.
filename = str(Path.home() / "shelf")
#os.remove(filename + ".db")
# The context manager closes the shelf when the block exits, so the database
# file is released even if printing fails.
with DbfilenameShelf(filename, flag='c', protocol=3, writeback=True) as db:
    #db.update(data)
    #db.sync()
    print(f"shelf: {dict(db)}")
Beispiel #24
0
 def __delitem__(self, key):
     """Delete *key* from the shelf, then call ``self.sync_now()``."""
     DbfilenameShelf.__delitem__(self, key)
     self.sync_now()
Beispiel #25
0
 def __init__(self, filename):
     """Open the shelf at *filename* with default options and keep the path."""
     DbfilenameShelf.__init__(self, filename)
     # Remembered on the instance for later use.
     self.filename = filename
Beispiel #26
0
 def __init__(self, *args, **kwargs):
     """Forward all arguments unchanged to ``DbfilenameShelf.__init__``."""
     DbfilenameShelf.__init__(self, *args, **kwargs)
Beispiel #27
0
 def __init__(self, filename):
     """Open the shelf at *filename* read-only and remember the path."""
     self.filename = filename
     # flag='r' opens read-only; protocol=-1 is passed as the pickle protocol.
     DbfilenameShelf.__init__(self, filename, flag='r', protocol=-1)
Beispiel #28
0
 def __init__(self, filename):
     """Open the shelf at *filename* read-only and remember the path."""
     self.filename = filename
     # flag='r' opens read-only; protocol=-1 is passed as the pickle protocol.
     DbfilenameShelf.__init__(self, filename, flag='r', protocol=-1)
Beispiel #29
0
 </form>
 <p><a href="accounts/forgot.py">Forgot password</a></p>""" % username
 # NOTE(review): the password is hashed with a single unsalted SHA-256 --
 # consider a dedicated password-hashing scheme.
 sha256_password = sha256(password.encode()).hexdigest()
 try:
     # NOTE(review): database credentials are hard-coded here.
     connection = db.connect('localhost', 'cf26', 'pecah', 'cs6503_cs1106_cf26')
     cursor = connection.cursor(db.cursors.DictCursor)
     # Parameterized query: the values are passed separately from the SQL text.
     cursor.execute("""SELECT * FROM users
                       WHERE username = %s
                       AND password = %s""", (username, sha256_password))
     if cursor.rowcount == 0:
         message = '<p><strong>Error! Incorrect user name or password</strong></p>'
     else:
         # Successful login: create a session file keyed by a new session id.
         cookie = SimpleCookie()
         # NOTE(review): the id is a hash of the current time, which is
         # guessable; consider the `secrets` module for session ids.
         sid = sha256(repr(time()).encode()).hexdigest()
         cookie['sid'] = sid
         session_store = DbfilenameShelf('sessions/sess_' + sid, writeback=True)
         session_store['authenticated'] = True
         session_store['username'] = username
         session_store.close()
         # NOTE(review): username is interpolated into HTML -- confirm it was
         # escaped before this point.
         result = """
             <h2>Welcome back %s!</h2>
             <ul>
                 <li><a href="game.py">Play Now</a></li>
                 <li><a href="account.py">Your Scores &amp; Account Management</a></li>
                 <li><a href="logout.py">Logout</a></li>
             </ul>""" % username
         print(cookie)
     # Not reached when an exception is raised above.
     cursor.close()
     connection.close()
 except (db.Error, IOError):
     message = '<p>Sorry! We are experiencing problems at the moment. Please try again later.</p>'
import cPickle as pkl
from rasengan import groupby
# Prefix prepended to every default path below.
PFX = get_pfx()
# Command-line options: the input shelf and the gzip / pickle parse files.
arg_parser = argparse.ArgumentParser(description='')
arg_parser.add_argument('--in_shelf',
                        default=PFX +
                        '/catpeople_clean_segmented_context.shelf',
                        type=str)
arg_parser.add_argument('--parsefn',
                        default=PFX + '/catpeople.parse.gz',
                        type=str)
arg_parser.add_argument('--parse_pkl',
                        default=PFX + '/catpeople.parse.pkl',
                        type=str)
args = arg_parser.parse_args()
# The shelf is opened read-only (flag='r').
catpeople = DbfilenameShelf(args.in_shelf, protocol=-1, flag='r')
TM = catpeople['__TOKEN_MAPPER__']
labelmap = get_labelmap()
ctmap = get_coarse_tagmap()
ftmap = get_fine_tagmap()
# Gzip reader over the parse file opened through proj_open.
f = gzip.GzipFile(fileobj=proj_open(args.parsefn))


def get(e):
    """Pick five columns out of one tab-separated line.

    Returns ``[col 1, int(col 6), col 7, col 3, col 4]`` (0-based columns).
    """
    columns = e.split('\t')
    first, fourth, fifth = columns[1], columns[3], columns[4]
    as_int = int(columns[6])
    eighth = columns[7]
    return [first, as_int, eighth, fourth, fifth]


PARSES = {}
for parse in groupby(f):
    token, parent, labels, ctags, ftags = zip(*[get(r) for r in parse])
Beispiel #31
0
 def __init__(self, *args, **kwargs):
     """Forward all arguments unchanged to ``DbfilenameShelf.__init__``."""
     DbfilenameShelf.__init__(self, *args, **kwargs)
Beispiel #32
0
 def sync(self):
     """Sync the underlying shelf, emitting a debug message before and after."""
     # noinspection PyUnresolvedReferences
     self._debug(f'Synchronizing...', header=f'Cache "{self.filename}"')
     DbfilenameShelf.sync(self)
     # noinspection PyUnresolvedReferences
     self._debug('Synchronized', header=f'Cache "{self.filename}"')
 def __init__(self, filename):
     """Open the shelf at *filename* with DbfilenameShelf's default options."""
     DbfilenameShelf.__init__(self,filename)
Beispiel #34
0
 def __delitem__(self, name):
     """Delete the entry stored under *name*, if any, and sync the shelf.

     *name* is UTF-8 encoded before it is used as the shelf key.  A missing
     key is ignored: nothing is deleted, nothing is synced, no error raised.
     """
     encoded_name = name.encode('utf8')
     if not DbfilenameShelf.has_key(self, encoded_name):
         return
     DbfilenameShelf.__delitem__(self, encoded_name)
     self.sync()
Beispiel #35
0
result = """
   <section>
       <p>You are not logged in.</p>
       <p>
           <a href="login.py">Login</a> &vert; <a href="accounts/register.py">Register</a>
       </p>
   </section>"""

try:
    # Look for an existing session id in the request cookie.
    cookie = SimpleCookie()
    http_cookie_header = environ.get('HTTP_COOKIE')
    if http_cookie_header:
        cookie.load(http_cookie_header)
        if 'sid' in cookie:
            sid = cookie['sid'].value
            # NOTE(review): a client-supplied cookie value is concatenated
            # into a file path without validation -- confirm it cannot escape
            # sessions/.
            session_store = DbfilenameShelf('sessions/sess_' + sid,
                                            writeback=False)
            # Continue only for sessions flagged as authenticated.
            if session_store.get('authenticated'):
                message = ''
                form_data = FieldStorage()
                username = session_store.get('username')
                form = """<p>
                    Hey, %s. Sorry to see you go.
                </p>
                <p>
                    <strong>Warning! This action is permenant.</strong> All of your scores will be lost.
                </p>
                <form action="delete_account.py" method="post">
                    <label for="pass1">Enter password: </label>
                    <input type="password" name="pass1" id="pass1" placeholder="Enter password" required />
                    <label for="pass2">Reenter password: </label>
                    <input type="password" name="pass2" id="pass2" placeholder="Reenter password" required />
Beispiel #36
0
 def __getitem__(self, name):
     """Return the value stored under the UTF-8 encoded form of *name*."""
     key = name.encode('utf8')
     value = DbfilenameShelf.__getitem__(self, key)
     return value
Beispiel #37
0
 def _open(self):
     """Initialise this shelf on ``shelfdb.db`` in the configured database directory."""
     # The directory comes from get_database_path() applied to config["path"].
     DbfilenameShelf.__init__(
         self,
         os.path.join(self.get_database_path(self.config["path"]),
                      "shelfdb.db"),
     )
Beispiel #38
0
def get_eye_position(shelf: shelve.DbfilenameShelf):
    """Return the value stored under ``EYE_POSITION`` in *shelf*, or 0 if absent."""
    return shelf.get(EYE_POSITION, 0)
Beispiel #39
0
 def __setitem__(self, key, value):
     """Store *value* under *key*, then call ``self.sync_now()``."""
     DbfilenameShelf.__setitem__(self, key, value)
     self.sync_now()
Beispiel #40
0
 def __init__(self, filename, protocol=2, writeback=True):
     """Open the shelf at *filename*; defaults are pickle protocol 2 with writeback on."""
     DbfilenameShelf.__init__(self, filename, protocol=protocol, writeback=writeback)
Beispiel #41
0
 def sync_now(self):
     """Close the shelf and reopen it on the same filename.

     NOTE(review): the shelf is reopened with DbfilenameShelf's default
     flag/protocol/writeback, which are not necessarily the options it was
     first opened with -- confirm this is intended.
     """
     # Read the path before close().
     filename = self.filename
     self.close()
     DbfilenameShelf.__init__(self, filename)
Beispiel #42
0
 def __setitem__(self, key, value):
     """Store *value* under *key* and sync; int keys are stored as ``str(key)``."""
     shelf_key = str(key) if isinstance(key, int) else key
     DbfilenameShelf.__setitem__(self, shelf_key, value)
     self.sync()
Beispiel #43
0
 def _reset(self):
     """Re-initialise the shelf on ``self.filename`` with ``flag='n'``."""
     # noinspection PyUnresolvedReferences
     DbfilenameShelf.__init__(self,
                              filename=self.filename,
                              flag='n',
                              writeback=self.in_memory_cache)
Beispiel #44
0
 def __getitem__(self, key):
     """Return the value stored under *key*; int keys are looked up as ``str(key)``."""
     shelf_key = str(key) if isinstance(key, int) else key
     return DbfilenameShelf.__getitem__(self, shelf_key)
Beispiel #45
0
import argparse
import sys, os
# Command-line options for the cleaning script.
arg_parser = argparse.ArgumentParser(
    description='Remove junk from catpeople wikimic')
arg_parser.add_argument('--seed', default=0, type=int, help='Default={0}')
arg_parser.add_argument('--MAX_CHAR_IN_SENT', default=1000, type=int)
# Data directory: a fixed path on host 'b15', the relative 'data/' elsewhere.
PDIR = ('/export/b15/prastog3' if os.uname()[1] == 'b15' else 'data/')
arg_parser.add_argument('--in_shelf',
                        default='%s/catpeople_wikilink_mentions.shelf' % PDIR,
                        type=str)
arg_parser.add_argument('--out_shelf',
                        default='%s/catpeople_clean_segmented_context.shelf' %
                        PDIR,
                        type=str)
args = arg_parser.parse_args()
# The input shelf is opened read-only; the output shelf uses the default flag.
in_shelf = DbfilenameShelf(args.in_shelf, protocol=-1, flag='r')
out_shelf = DbfilenameShelf(args.out_shelf, protocol=-1)
urls = in_shelf['__URL_LIST__']

PAT_TOKENIZER = get_tokenizer()
TOKEN_MAPPER = TokenMapper()
MAX_CHAR_IN_SENT = args.MAX_CHAR_IN_SENT
import re
# Raw string: the pattern value is unchanged, but ``\.`` is no longer an
# invalid escape sequence in a normal string literal (a warning on Python 3).
MIDDLE_NAME_REGEX = re.compile(r'[A-Z][^ ]*? [A-Z]\. [A-Z]')
for url_idx, url in enumerate(urls):
    print >> sys.stderr, ('Done: %.3f \r' %
                          (float(url_idx) * 100 / len(urls))),
    mentions = in_shelf[url]

    out_mentions = []
    for mention in mentions:
Beispiel #46
0
 def __delitem__(self, key):
     """Delete the entry under *key* and sync; int keys are deleted as ``str(key)``."""
     shelf_key = str(key) if isinstance(key, int) else key
     DbfilenameShelf.__delitem__(self, shelf_key)
     self.sync()