Example #1
    def assertIterationDataRecorded(self, expected, tolerance, root):
        if self.comm.rank != 0:
            return

        db = SqliteDict(self.filename, self.tablename)
        _assertIterationDataRecorded(self, db, expected, tolerance)
        db.close()
Example #2
def test_sqlitedict_write(text, datasize):
    # assumes `text` and `datasize` are supplied by the test harness (e.g. pytest fixtures)
    d = SqliteDict(f'debug_{datasize}.sqlite')
    for j in range(3):
        for i, line in enumerate(text):
            d[str(i + j * len(text))] = line
    d.commit()
    d.close()
Example #3
    def assertMetadataRecorded(self, expected):
        if self.comm.rank != 0:
            return

        db = SqliteDict(self.filename, self.tablename_metadata)
        _assertMetadataRecorded(self, db, expected)
        db.close()
Example #4
File: efa.py Project: deti/efa
def adjust_evernote_font():
    """
    Call for Evernote
    """
    note_info = SqliteDict(conf.db.db_file, autocommit=True)

    notes_in_evernote = list()
    for note in get_notes(get_notebooks()):
        guid = note.guid
        notes_in_evernote.append(guid)
        if guid not in note_info.keys() \
                or note_info[guid][FONT_SIZE] != conf.font_size \
                or note_info[guid][LINE_HEIGHT] != conf.line_height:
            adjust_note(note)
            note_info[guid] = {FONT_SIZE: conf.font_size,
                               LINE_HEIGHT: conf.line_height}

    guids_to_forget = [guid for guid in note_info.keys()
                       if guid not in notes_in_evernote]

    for guid in guids_to_forget:
        logging.debug("Delete guid from DB: {}".format(guid))
        del note_info[guid]

    note_info.close()
Example #5
class EmbeddingIndexer(object):
    def __init__(self, embedding_file, index_file):
        self.embedding_file = embedding_file
        self.index_file = index_file
        logger.info("Input path: " + self.embedding_file)
        logger.info("Index path: " + self.index_file)
        self.embedding = SqliteDict(os.path.join(self.index_file, EMBEDDING),
                                    autocommit=True)

    def iterator(self):
        with open(self.embedding_file) as f:
            for line in f:
                tokens = line.strip().split(" ")
                if len(tokens) == 2:
                    continue
                uri = tokens[0]
                embedding = np.array(tokens[1:], dtype=np.float32)
                yield (uri, embedding)

    def run(self):
        count = 0
        for key, value in self.iterator():
            self.embedding[key] = value
            count += 1
            if count % 20000 == 0:
                self.embedding.commit()
                logger.info("[{}] {} index added.".format(
                    datetime.datetime.now(), count))
        self.embedding.close()
Example #6
 def post(self):
     data = {"success": False}
     stats_dict = SqliteDict(
         './api_stats.sqlite',
         autocommit=True)  # TODO: properly implement stats
     # output_types = ["pseudonymized", "tagged", "conll"]
     """Upload a file."""
     # try:
     file = request.files['file']  #get file in the request
     if file and self.allowed_file(file.filename):
         filename = secure_filename(
             file.filename)  #make sure we have a proper filename
         logger.info(f'**found {filename}')
         full_filename = UPLOAD_DIRECTORY / filename
         file.save(full_filename)  #saves pdf in folder
         pdf2txt(full_filename)  #call pdf2txt on pdf
         with open(full_filename.with_suffix('.txt'), 'r',
                   encoding='utf-8') as f:
             output = f.read()
         os.remove(full_filename)
         os.remove(full_filename.with_suffix('.txt'))
         data["text"] = str(output)
         data["success"] = True
         # TODO : add treatment for tabs
         # TODO : add spell checks
     stats_dict.close()
     return data
Example #7
 def __test_irregular_tablenames(tablename):
     filename = ':memory:'
     db = SqliteDict(filename, tablename=tablename)
     db['key'] = 'value'
     db.commit()
     self.assertEqual(db['key'], 'value')
     db.close()
Example #8
class CDataBase(object):
    def __init__(self):
        try:
            self.close()
        except:
            pass
        self.mydict = SqliteDict('./DB/my_db.sqlite', autocommit=True)
        self.show()

    def set(self, key, value):
        self.mydict[key] = value

    def get(self, key):
        if key in self.mydict.keys():
            ret = self.mydict[key]
        else:
            ret = None
        return ret

    def show(self, start_with=''):
        for key, value in self.mydict.items():
            if key.startswith(start_with):  # only show keys with the given prefix
                print(key, '\t', value, '\n')

    def clear(self):
        self.mydict.clear()

    def close(self):
        self.mydict.close()
Example #9
def create_keywords_from_url(url_db):
    url_keywords = dict()

    db = SqliteDict(url_db, autocommit=False)
    urls = db.keys()
    lemmatizer = WordNetLemmatizer()

    for url in urls:
        entity = url.split('/')[-1]

        if "–" in entity:
            words = list()
            first_words = entity.split('_')
            for word in first_words:
                words.extend(word.split('–'))
        elif "-" in entity:
            words = list()
            first_words = entity.split('_')
            for word in first_words:
                words.extend(word.split('-'))
        else:
            words = entity.split('_')

        keywords = set()
        for word in words:
            prepro = word.strip(',').strip('.').strip('(').strip(')').lower()
            #keywords.add(prepro)
            keywords.add(lemmatizer.lemmatize(prepro))

        url_keywords[url] = keywords

    db.close()

    return url_keywords
Example #10
 def __init__(self, filename: Path = None):
     self.store = filename or DataStore.DEFAULT_FILE
     if not self.store.parent.exists():
         self.store.parent.mkdir(parents=True, exist_ok=True)
     sqlite = SqliteDict(self.store)
     sqlite.close()
     self.store.chmod(0o600)
Example #11
def test_sqlitedict_read(text, datasize):
    # assumes `text` and `datasize` are supplied by the test harness (e.g. pytest fixtures)
    d = SqliteDict(f'debug_{datasize}.sqlite')
    for j in range(3):
        for i in range(0, datasize):
            a = d[str(i + j * len(text))]
            assert a == text[i]
    d.close()
Example #12
def lookup_knn(query,
               top_k=20,
               path_q2sig='./demo_data/q2sig.sqldict',
               path_sig2buckets='./demo_data/sig2buckets',
               path_feature_map='./demo_data/feature_map.sqldict'):
    candidates = SortedSet(key=lambda t: (-t[1], t[0]))
    with SqliteDict(path_q2sig) as db_q2sig:
        sigs = db_q2sig[query]
    fmap = SqliteDict(path_feature_map)
    bucket_maps = []
    for path in glob.glob(f"{path_sig2buckets}*"):
        bucket_maps.append(SqliteDict(path))
    for db_sig2buckets in bucket_maps:
        for sig in sigs:
            bucket = db_sig2buckets.get(sig, None)
            if bucket is None: continue
            for q in bucket:
                if q == query:
                    continue
                sim = pairwise_cosine_similarity(fmap[q], fmap[query])
                candidates.add((q, sim))
    candidates = candidates[:top_k]
    fmap.close()
    for db_sig2buckets in bucket_maps:
        db_sig2buckets.close()
    return candidates
Example #13
def _import_sql_data(data_dir):
    file_path = os.path.join(data_dir, DATA_FILE)

    # Find out what format we have
    with sqlite3.connect(file_path) as conn:
        try:
            conn.execute('select count(*) from zipgun_info')
            zipgun_info = SqliteDict(file_path, tablename='zipgun_info')
            version = zipgun_info.get('version', 0)
        except sqlite3.OperationalError:
            version = 0

    if version == 0:
        country_postal_codes = SqliteDict(file_path)
    elif version == 1:
        country_postal_codes = {}
        for country_code in zipgun_info['country_codes']:
            if country_code in country_postal_codes:
                raise ValueError('Duplicate entry found for {}'.format(
                    country_code))
            country_postal_codes[country_code] = SqliteDict(
                file_path, tablename='zg_{}'.format(country_code),
                journal_mode='OFF')
        zipgun_info.close()
    else:
        raise ValueError('Unknown data file version {}'.format(version))
    return country_postal_codes
Example #14
def _import_sql_data(data_dir):
    import sqlite3
    from sqlitedict import SqliteDict

    file_path = os.path.join(data_dir, DATA_FILE)

    # Find out what format we have
    with sqlite3.connect(file_path) as conn:
        try:
            conn.execute('select count(*) from zipgun_info')
            zipgun_info = SqliteDict(file_path, tablename='zipgun_info')
            version = zipgun_info.get('version', 0)
        except sqlite3.OperationalError:
            version = 0

    if version == 0:
        country_postal_codes = SqliteDict(file_path)
    elif version == 1:
        country_postal_codes = {}
        for country_code in zipgun_info['country_codes']:
            if country_code in country_postal_codes:
                raise ValueError(
                    'Duplicate entry found for {}'.format(country_code))
            country_postal_codes[country_code] = SqliteDict(
                file_path,
                tablename='zg_{}'.format(country_code),
                journal_mode='OFF')
        zipgun_info.close()
    else:
        raise ValueError('Unknown data file version {}'.format(version))
    return country_postal_codes
Example #15
def clear_db(db_path_shadow: str) -> None:
    doc_vecs_db = SqliteDict(db_path_shadow)
    print("Clearing db {}".format(db_path_shadow))
    for key in tqdm(doc_vecs_db.keys()):
        del doc_vecs_db[key]
    doc_vecs_db.commit()
    doc_vecs_db.close()
Example #16
    def test_readonly(self):
        fname = norm_file('tests/db/sqlitedict-override-test.sqlite')
        orig_db = SqliteDict(filename=fname)
        orig_db['key'] = 'value'
        orig_db['key_two'] = 2
        orig_db.commit()
        orig_db.close()

        readonly_db = SqliteDict(filename=fname, flag='r')
        self.assertTrue(readonly_db['key'] == 'value')
        self.assertTrue(readonly_db['key_two'] == 2)

        def attempt_write():
            readonly_db['key'] = ['new_value']

        def attempt_update():
            readonly_db.update(key='value2', key_two=2.1)

        def attempt_delete():
            del readonly_db['key']

        def attempt_clear():
            readonly_db.clear()

        def attempt_terminate():
            readonly_db.terminate()

        attempt_funcs = [
            attempt_write, attempt_update, attempt_delete, attempt_clear,
            attempt_terminate
        ]

        for func in attempt_funcs:
            with self.assertRaises(RuntimeError):
                func()
Example #17
def persistence_load(db_path=config.WN_FEATURE_CACHE_PATH):
    p_dict = {
        'hypernym_stems_dict': dict(),
        'hyponym_stems_dict': dict(),
        'hyper_lvl_dict': dict(),
        'hypo_lvl_dict': dict(),
        'ant_dict': dict(),
        'em_lemmas_dict': dict(),
    }

    # if em_dict:
    #     p_dict['em_dict'] = dict()

    for dict_name in p_dict.keys():
        print("Loading Persistent WN Feature Dict:", dict_name)
        if dict_name != 'em_dict':
            in_db_dict = SqliteDict(str(db_path / dict_name),
                                    autocommit=False,
                                    tablename='the_table',
                                    flag='c')
            for key, v in tqdm(in_db_dict.items()):
                p_dict[dict_name][key] = v
            in_db_dict.close()
        elif dict_name == 'em_dict':
            in_db_dict = SqliteDict(str(db_path / dict_name),
                                    autocommit=False,
                                    tablename='the_table',
                                    flag='c')
            for key, v in tqdm(in_db_dict.items()):
                p_dict[dict_name][key] = v
            in_db_dict.close()

    return p_dict
Example #18
    def search(self, obj, exact=False, db=None):
        """
        Search the database for partial matches of [obj], and return a list of matches
        in the tuple form:

        ("obj", { "filename_hash": string,
                  "cryptographer": string,
                  "key": string,
                  "storage_provider": string,
                  "bucket": string,
                  "file_hash": string } )

        If [exact] == True, then only exact matches will be returned. Since there should
        only ever be a single exact match for a path in the DB, a CstashCriticalException
        will be thrown if more than a single element is in the resulting list. This shouldn't
        be possible anyway, since the DB is a key/value store, but it's a safety measure.
        """

        db = db or self.db
        db_connection = SqliteDict(db, autocommit=True, flag='r')
        if exact is True:
            keys = [(k, db_connection[k]) for k in db_connection.keys() if obj == k]
        elif obj is None:
            keys = [(k, db_connection[k]) for k in db_connection.keys()]
        else:
            keys = [(k, db_connection[k]) for k in db_connection.keys() if obj in k]

        if exact is True and len(keys) > 1:
            raise exceptions.CstashCriticalException(message=(f"Found more than a single match "
                f"for {obj} in the database:\n\n{keys}")) # pylint: disable=bad-continuation

        db_connection.close()

        return keys
Example #19
    def assertIterationDataRecorded(self, expected, tolerance, root):
        if self.comm.rank != 0:
            return

        db = SqliteDict(self.filename, self.tablename_iterations)
        _assertIterationDataRecorded(self, db, expected, tolerance)
        db.close()
Example #20
class SqliteDictDupesFilter(object):
    def __init__(self):
        """
        SqliteDict-based dupes filter
        """
        self.dupes_db_file = tempfile.mktemp()
        self.__filter = None

    def __create_db(self):
        self.__filter = SqliteDict(self.dupes_db_file,
                                   flag='n',
                                   autocommit=True)

    def __contains__(self, element):
        if self.__filter is None:
            self.__create_db()
        return element in self.__filter

    def add(self, element):
        if self.__filter is None:
            self.__create_db()
        self.__filter[element] = '-'

    def close(self):
        if self.__filter is not None:
            try:
                self.__filter.close()
                os.remove(self.dupes_db_file)
            except:
                pass
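A minimal usage sketch for the dupes filter defined above; the URLs are illustrative, and only methods defined on the class are used:

seen = SqliteDictDupesFilter()
for url in ["https://example.com/a", "https://example.com/a", "https://example.com/b"]:
    if url in seen:    # __contains__ lazily creates the backing SqliteDict
        continue
    seen.add(url)      # stored as a key with a dummy value
seen.close()           # closes the SqliteDict and removes the temporary file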
Example #21
    def sqliteFileIO(self, aKey, Val=None):
        # distinguish between reading  and writing
        if Val is None:
            #is reading
            key_dict = SqliteDict(self.dict_sqlite, autocommit=True)
            global card_data
            try:
                card_data = key_dict[aKey]
                self.ids._bios.reg.text = '{}'.format(f'{card_data[0]}')
                self.ids._bios.serial_no.text = '{}'.format(f'{card_data[1]}')
                self.ids._bios.phone_no.text = '{}'.format(f'{card_data[2]}')
                self.ids._bios.card.text = '{}'.format(f'{card_data[-1]}')
                self.ids._status.stat.text = "[INFO] Verification complete"
                # remove the key value pair
                del key_dict[str(aKey)]
                #reset the status
            except KeyError:
                self.ids._scanner2.scan2.text = f"[{aKey}] Card unrecognized(Scan Key)"
            key_dict.close()

        else:
            #is writing
            key_dict = SqliteDict(self.dict_sqlite, autocommit=True)
            try:
                key_dict[aKey] = Val
            except KeyError:
                self.ids._scanner1.scan1.text = f"[{aKey}] Key already scanned"

            key_dict.close()
Example #22
    def assertMetadataRecorded(self, expected):
        if self.comm.rank != 0:
            return

        db = SqliteDict(self.filename, self.tablename)
        _assertMetadataRecorded(self, db, expected)
        db.close()
Example #23
def get_account(server: Guild, member: Member) -> int:
    """Return the account level for a given user.

    Intended for export to Cogs.
    """
    uid = str(member.id)
    sid = str(server.id)

    # Get a temporary instance of the main database
    database = SqliteDict(
        filename=f"db/{inst.database}",
        tablename="discord-bot",
        encode=json.dumps,
        decode=json.loads
    )

    if "accounts" not in database:
        database.close()
        raise KeyError("Database not initialized.")

    db_dict = database["accounts"]
    database.close()

    if sid not in db_dict:
        raise KeyError("Server has no accounts.")

    if uid not in db_dict[sid]:
        raise KeyError("User does not have an account for this server.")
    else:
        return db_dict[sid][uid]
Example #24
 def solve_in_mem(self):
     database = {}
     self.solveAll(database)
     sqldb = SqliteDict(self.dbName, autocommit=True)
     for k, v in database.items():
         sqldb[k] = v.simplify()
     sqldb.close()
     print("[Solver] Graph saved to disk")
Example #25
class SessionState:
    @staticmethod
    def my_encode(obj):
        try:
            if isinstance(obj, list) and obj and isinstance(
                    obj[0], RevComment):
                enc_obj = [v.encode() for v in obj]
            else:
                enc_obj = obj
            return dumps(enc_obj, ensure_ascii=False)
        except TypeError:
            return sqlite3.Binary(
                zlib.compress(pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)))

    @staticmethod
    def my_decode(obj):
        try:
            obj = loads(obj)
        except ValueError:
            return pickle.loads(zlib.decompress(bytes(obj)))
        if isinstance(obj, list) and obj and isinstance(
                obj[0], dict) and 'comment' in obj[0]:
            obj = [RevComment.decode(v) for v in obj]
        return obj

    def __init__(self, cache_file: str, user_requested=False):
        self.user_requested = user_requested
        self.cache = SqliteDict(cache_file,
                                autocommit=True,
                                encode=self.my_encode,
                                decode=self.my_decode)
        self.session = Session()
        # noinspection PyTypeChecker
        self.session.mount(
            'https://',
            HTTPAdapter(
                max_retries=Retry(total=3,
                                  backoff_factor=0.1,
                                  status_forcelist=[500, 502, 503, 504])))
        self.sites = {}

    def __enter__(self):
        return self

    def __exit__(self, typ, value, traceback):
        self.cache.close()
        self.session.close()

    def get_site(self, domain: str) -> WikiSite:
        try:
            return self.sites[domain]
        except KeyError:
            # noinspection PyTypeChecker
            site = WikiSite(domain, self.session, domain == primary_domain)
            if self.user_requested:
                site.maxlag = None
            self.sites[domain] = site
            return site
Example #26
def get_doc(doc_id):
    doc_db = SqliteDict(os.path.join(db_path, "docs.db"))
    try:
        document = doc_db[doc_id]
    except KeyError:
        doc_db.close()
        return Response(status=404)
    doc_db.close()
    return jsonify(document)
Example #27
 def test_as_str(self):
     """Verify SqliteDict.__str__()."""
     # given,
     db = SqliteDict()
     # exercise
     db.__str__()
     # test when db closed
     db.close()
     db.__str__()
Example #28
def process_coins():
    coins_db = SqliteDict('./coins.db', autocommit=True)
    data = coins_db['coin_data']
    coins_db.close()
    all_coins = []
    for coin in data:
        coin["isover40"] = calculate_coin(coin['price_usd'])
        all_coins.append(coin)
    return all_coins
Example #30
    def return_all_entries(self, db=None):
        """ Return a dict of the database """

        db = db or self.db
        db_connection = SqliteDict(db, autocommit=True, flag='r')
        entries = dict(db_connection)
        db_connection.close()

        return entries
Example #31
def set_query_results(query, k=20):
    query_map = SqliteDict(query_map_path)
    query_vec = query_map[query]
    query_map.close()
    query_db = SqliteDict(query_db_path)
    results = search(docs_tfidf, query_vec, hw2.cosine_sim)
    query_db[query] = results[:k]
    query_db.commit()
    query_db.close()
Example #32
def add_docs_to_query_vector(query_vector: BagOfWordsVector, docs: list[int],
                             alpha: float) -> BagOfWordsVector:
    doc_db = SqliteDict(doc_vecs_db_path)
    for doc_id in docs:
        doc_vector = try_to_get_doc_vector_from_db(doc_id, doc_db)
        query_vector = add_vectors(query_vector,
                                   scalar_multiply(doc_vector, alpha))
    doc_db.close()
    return query_vector
Example #33
def try_to_get_query_from_db(q: str) -> BagOfWordsVector:
    query_map = SqliteDict(query_map_path)
    try:
        query_vector = query_map[q]
        query_map.close()
        return query_vector
    except KeyError:
        query_map.close()
        raise HTTPException
Example #34
def criaTeste():
    mydict = SqliteDict('db/teste.sqlite', "mails", autocommit=True)
    mydict['1'] = "batata"
    mydict['2'] = "banana"
    mydict['3'] = "oi"
    mydict['4'] = "teste"
    for key, value in mydict.iteritems():
        print(key, value)
    print(len(mydict))  # etc... all dict functions work
    mydict.close()
Example #35
def process_coins():
    coins_db = SqliteDict('/home/phennaux115/Documents/Python-Flask/coins.db',
                          autocommit=True)
    data = coins_db["coin_data"]
    coins_db.close()
    all_coins = []
    for coin in data:
        coin["isover40"] = calculate_coin(coin["price_usd"])
        all_coins.append(coin)
    return all_coins
Example #36
    def test_default_reuse_existing_flag_c(self):
        """Re-opening of a database does not destroy it."""
        # given,
        fname = norm_file('tests/db/sqlitedict-override-test.sqlite')
        orig_db = SqliteDict(filename=fname)
        orig_db['key'] = 'value'
        orig_db.commit()
        orig_db.close()

        next_db = SqliteDict(filename=fname)
        self.assertIn('key', next_db.keys())
        self.assertEqual(next_db['key'], 'value')
Example #37
    def test_overwrite_using_flag_n(self):
        """Re-opening of a database with flag='c' destroys it all."""
        # given,
        fname = norm_file('tests/db/sqlitedict-override-test.sqlite')
        orig_db = SqliteDict(filename=fname, tablename='sometable')
        orig_db['key'] = 'value'
        orig_db.commit()
        orig_db.close()

        # verify,
        next_db = SqliteDict(filename=fname, tablename='sometable', flag='n')
        self.assertNotIn('key', next_db.keys())
Example #38
def basic_usage():
    """SqliteDict引擎会将任意的value转换为
    """
    mydict = SqliteDict("test.sqlite", autocommit=True)
    mydict["integer_value"] = 1
    mydict["real_value"] = 2.2
    mydict["text_value"] = "abc"
    mydict["date_value"] = date.today()
    mydict["datetime_value"] = datetime.now()
    
    # if you don't use with SqliteDict("test.sqlite") as mydict: ...
    # you have to close the connection explicitly
    mydict.close() 
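The comment in the example above refers to the context-manager form; a minimal sketch of that variant (file name illustrative):

from sqlitedict import SqliteDict

# the connection is closed automatically when the with-block exits;
# autocommit=True persists each assignment as it is made
with SqliteDict("test.sqlite", autocommit=True) as mydict:
    mydict["text_value"] = "abc"
    print(mydict["text_value"])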
Example #39
    def assertDatasetEquals(self, expected, tolerance):
        # Close the file to ensure it is written to disk.
        self.recorder.close()
        # self.recorder.out = None

        sentinel = object()

        db = SqliteDict( self.filename, self.tablename )


        for coord, expect in expected:
            iter_coord = format_iteration_coordinate(coord)

            groupings = (
                ("Parameters", expect[0]),
                ("Unknowns", expect[1]),
                ("Residuals", expect[2])
            )

            #### Need to get the record with the key of 'iter_coord'
            actual_group = db[iter_coord]
            timestamp = actual_group['timestamp']

            self.assertTrue(self.t0 <= timestamp and timestamp <= self.t1 )

            for label, values in groupings:
                actual = actual_group[label]
                # If len(actual) == len(expected) and actual <= expected, then
                # actual == expected.
                self.assertEqual(len(actual), len(values))
                for key, val in values:
                    found_val = actual.get(key, sentinel)
                    if found_val is sentinel:
                        self.fail("Did not find key '{0}'".format(key))
                    
                    if isinstance(found_val, _ByObjWrapper):
                        found_val = found_val.val

                    try:
                        assert_rel_error(self, found_val, val, tolerance)
                    except TypeError as error:
                        self.assertEqual(found_val, val)

            del db[iter_coord]
            ######## delete the record with the key 'iter_coord'

        # Having deleted all found values, the file should now be empty.
        ###### Need a way to get the number of records in the main table
        self.assertEqual(len(db), 0)

        db.close()
Example #40
    def assertDatasetEquals(self, expected, tolerance):
        # Close the file to ensure it is written to disk.
        self.recorder.close()
        # self.recorder.out = None

        sentinel = object()

        db = SqliteDict( self.filename, self.tablename )

        ###### Need a way to get a list of the group_names in the order in which they were written and put it in  a variable named order
        order = db['order']
        del db['order']

        for coord, expect in expected:
            iter_coord = format_iteration_coordinate(coord)

            self.assertEqual(order.pop(0), iter_coord)

            groupings = (
                ("Parameters", expect[0]),
                ("Unknowns", expect[1]),
                ("Residuals", expect[2])
            )

            #### Need to get the record with the key of 'iter_coord'
            actual_group = db[iter_coord]

            for label, values in groupings:
                actual = actual_group[label]
                # If len(actual) == len(expected) and actual <= expected, then
                # actual == expected.
                self.assertEqual(len(actual), len(values))
                for key, val in values:
                    found_val = actual.get(key, sentinel)
                    if found_val is sentinel:
                        self.fail("Did not find key '{0}'".format(key))
                    assert_rel_error(self, found_val, val, tolerance)
            del db[iter_coord]
            ######## delete the record with the key 'iter_coord'

        # Having deleted all found values, the file should now be empty.
        ###### Need a way to get the number of records in the main table
        self.assertEqual(len(db), 0)

        # As should the ordering.
        self.assertEqual(len(order), 0)

        db.close()
Example #41
    def test_irregular_tablenames(self):
        """Irregular table names need to be quoted"""
        db = SqliteDict(':memory:', tablename='9nine')
        db['key'] = 'value'
        db.commit()
        self.assertEqual(db['key'], 'value')
        db.close()

        db = SqliteDict(':memory:', tablename='outer space')
        db['key'] = 'value'
        db.commit()
        self.assertEqual(db['key'], 'value')
        db.close()

        with self.assertRaisesRegexp(ValueError, r'^Invalid tablename '):
            SqliteDict(':memory:', '"')
Example #42
    def test_driver_records_model_viewer_data(self):
        size = 3

        prob = Problem(Group(), impl=impl)

        G1 = prob.root.add('G1', ParallelGroup())
        G1.add('P1', IndepVarComp('x', np.ones(size, float) * 1.0))
        G1.add('P2', IndepVarComp('x', np.ones(size, float) * 2.0))

        prob.root.add('C1', ABCDArrayComp(size))

        prob.root.connect('G1.P1.x', 'C1.a')
        prob.root.connect('G1.P2.x', 'C1.b')

        prob.driver.add_recorder(self.recorder)

        self.recorder.options['record_metadata'] = True
        prob.setup(check=False)

        prob.cleanup()

        # do some basic tests to make sure the model_viewer_data was recorded correctly
        if self.comm.rank == 0:
            db = SqliteDict(self.filename, self.tablename_metadata)
            model_viewer_data = db['model_viewer_data']
            tr = model_viewer_data['tree']
            self.assertEqual(set(['name', 'type', 'subsystem_type', 'children']), set(tr.keys()))

            names = []
            for ch1 in tr['children']:
                # each is an ordereddict
                names.append(ch1["name"] )
                for ch2 in ch1["children"]:
                    names.append(ch2["name"] )
                    if "children" in ch2:
                        for ch3 in ch2["children"]:
                            names.append(ch3["name"] )

            expected_names = ['G1', 'P1', 'x', 'P2', 'x', 'C1', 'a', 'b',
                        'in_string', 'in_list', 'c', 'd', 'out_string', 'out_list']

            self.assertEqual( sorted(expected_names), sorted(names))

            cl = model_viewer_data['connections_list']
            for c in cl:
                self.assertEqual(set(['src', 'tgt']), set(c.keys()))
            db.close()
Example #43
    def test_recording_model_viewer_data(self):
        prob = Problem()
        prob.root = ConvergeDiverge()
        prob.driver.add_recorder(self.recorder)
        self.recorder.options["record_metadata"] = True
        prob.setup(check=False)
        prob.cleanup()  # closes recorders

        # do some basic tests to make sure the model_viewer_data was recorded
        db = SqliteDict(filename=self.filename, flag="r", tablename="metadata")
        model_viewer_data = db["model_viewer_data"]
        tr = model_viewer_data["tree"]
        self.assertEqual(set(["name", "type", "subsystem_type", "children"]), set(tr.keys()))
        cl = model_viewer_data["connections_list"]
        for c in cl:
            self.assertEqual(set(["src", "tgt"]), set(c.keys()))
        db.close()
Example #44
    def test_recording_system_metadata(self):
        prob = Problem()
        prob.root = ConvergeDiverge()
        prob.root.add_metadata("string", "just a test")
        prob.root.add_metadata("ints", [1, 2, 3])
        prob.driver.add_recorder(self.recorder)
        self.recorder.options["record_metadata"] = True
        prob.setup(check=False)
        prob.cleanup()  # closes recorders

        # check the system metadata recording
        sqlite_metadata = SqliteDict(filename=self.filename, flag="r", tablename="metadata")
        system_metadata = sqlite_metadata["system_metadata"]
        self.assertEqual(len(system_metadata), 2)
        self.assertEqual(system_metadata["string"], "just a test")
        self.assertEqual(system_metadata["ints"], [1, 2, 3])
        sqlite_metadata.close()
Example #45
def test_mutiple_thread():
    """多个进程访问数据库的时候, 最好只有一个有写操作。如果需要多个进程进行写操作, 则需要每次在
    写操作后进行commit, 也就是说需要打开autocommit
    """
    dict1 = SqliteDict("test.sqlite", autocommit=False) # if False, then mutiple thread writing is
    dict2 = SqliteDict("test.sqlite", autocommit=False) # not allowed
    print(dict1["integer_value"])
    print(dict2["integer_value"])

#     dict1["integer_value"] = 2
    print(dict1["integer_value"], dict2["integer_value"])
    
#     dict2["integer_value"] = 3
    print(dict1["integer_value"], dict2["integer_value"])
    
    dict1.close()
    dict2.close()
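The docstring above says that with several writers every write must be committed immediately, i.e. autocommit must be on; a minimal sketch of that setup (same test.sqlite file, the second handle only reads):

from sqlitedict import SqliteDict

writer = SqliteDict("test.sqlite", autocommit=True)  # every write is committed right away
reader = SqliteDict("test.sqlite")                   # separate connection to the same file
writer["integer_value"] = 2
print(reader["integer_value"])                       # expected to read back 2, since the write was committed
writer.close()
reader.close()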
Example #46
def _persist_v1(file_path, zg):
    print 'Creating meta db...'
    zipgun_info = SqliteDict(
        file_path, tablename='zipgun_info', autocommit=False)
    zipgun_info['version'] = 1
    zipgun_info['country_codes'] = zg.country_postal_codes.keys()
    zipgun_info.commit()

    for country_code in zg.country_postal_codes:
        print 'Creating {} db...'.format(country_code)
        country_data = SqliteDict(
            file_path, tablename='zg_{}'.format(country_code),
            autocommit=False)
        country_data.update(zg.country_postal_codes[country_code])
        country_data.commit()
        time.sleep(1.0)                   # Pretty bullshit
        country_data.close()
    zipgun_info.close()
Example #47
class SqliteDictJsonSerializationTest(unittest.TestCase):
    def setUp(self):
        self.fname = norm_file('tests/db-json/sqlitedict.sqlite')
        self.db = SqliteDict(
            filename=self.fname, tablename='test', encode=json.dumps, decode=json.loads
        )

    def tearDown(self):
        self.db.close()
        os.unlink(self.fname)
        os.rmdir(os.path.dirname(self.fname))

    def get_json(self, key):
        return self.db.conn.select_one('SELECT value FROM test WHERE key = ?', (key,))[0]

    def test_int(self):
        self.db['test'] = -42
        assert self.db['test'] == -42
        assert self.get_json('test') == '-42'

    def test_str(self):
        test_str = u'Test \u30c6\u30b9\u30c8'
        self.db['test'] = test_str
        assert self.db['test'] == test_str
        assert self.get_json('test') == r'"Test \u30c6\u30b9\u30c8"'

    def test_bool(self):
        self.db['test'] = False
        assert self.db['test'] is False
        assert self.get_json('test') == 'false'

    def test_none(self):
        self.db['test'] = None
        assert self.db['test'] is None
        assert self.get_json('test') == 'null'

    def test_complex_struct(self):
        test_value = {
            'version': 2.5,
            'items': ['one', 'two'],
        }
        self.db['test'] = test_value
        assert self.db['test'] == test_value
        assert self.get_json('test') == json.dumps(test_value)
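The test class above swaps the default pickle serialization for JSON through the encode/decode arguments; the same idea outside a test harness, as a short sketch (file name illustrative):

import json
from sqlitedict import SqliteDict

# values are stored as JSON text instead of pickled blobs,
# so only JSON-serializable values can be written
db = SqliteDict("settings.sqlite", tablename="test", encode=json.dumps, decode=json.loads)
db["config"] = {"version": 2.5, "items": ["one", "two"]}
db.commit()
print(db["config"])
db.close()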
Example #48
class Scribe(object):
    def __init__(self, location, table_name, exp_name):
        filename = "{}/scribe.sqlite".format(location)
        self.book = SqliteDict(filename, autocommit=True, tablename=table_name)
        unique_id = datetime.now().strftime("date_%m.%d_time_%H.%M")
        self.exp_name = exp_name+"_"+unique_id
        self.observation_index = 0


    def record(self, value, type="general"):
        key = "{}; {}; {}".format(self.exp_name, self.observation_index, type)
        self.book[key] = value
        self.observation_index += 1

    observe = record #sometimes i forget which

    def lookup(self, type=None, exp_name=None, ret_sorted=False, strip_keys=False):
        type_func = lambda *args: True
        name_func = lambda *args: True

        if type:
            type_func = lambda x: x[2] == type

        if exp_name:
            name_func = lambda x: exp_name in x[0]

        key_func = lambda x: type_func(x) and name_func(x)
        unpack = lambda x: [f(x.strip()) for f,x in zip([str,int,str],x.split(";"))]
        items = {k:v for k,v in self.book.iteritems() if key_func(unpack(k))}
        if ret_sorted:
            return self.sort_results(items, strip_keys)
        return items

    def sort_results(self, result_dict, only_val_return=False):
        unpack = lambda x: [f(x.strip()) for f,x in zip([str,int,str],x.split(";"))]
        ranker = lambda x: unpack(x[0])[1]
        sorted_items = sorted(result_dict.items(), key=ranker)
        if only_val_return:
            return [v for k,v in sorted_items]
        return sorted_items

    def close(self):
        self.book.close()
Example #49
class FileCache(CacheInterface):
    def __init__(self, config):
        CacheInterface.__init__(self, config)
        self.db = SqliteDict(self.config[u"cache"][u"file"], autocommit=True)
        self.expiration = self.config[u"cache"].get(u"expiration", 86400)
        def closer():
            try:
                self.db.close()
            except Exception:
                logger.exception("Exception closing file cache")
        atexit.register(closer)

    def get(self, key):
        if int(self.db[key + "_expiration"]) - time.time() <= 0:
            raise KeyError("cache key expired")
        return self.db[key]

    def set(self, key, val):
        self.db[key] = val
        self.db[key + "_expiration"] = str(int(time.time()) + self.expiration)
Example #50
    def test_overwrite_using_flag_w(self):
        """Re-opening of a database with flag='w' destroys only the target table."""
        # given,
        fname = norm_file('tests/db/sqlitedict-override-test.sqlite')
        orig_db_1 = SqliteDict(filename=fname, tablename='one')
        orig_db_1['key'] = 'value'
        orig_db_1.commit()
        orig_db_1.close()

        orig_db_2 = SqliteDict(filename=fname, tablename='two')
        orig_db_2['key'] = 'value'
        orig_db_2.commit()
        orig_db_2.close()

        # verify, when re-opening table space 'one' with flag='w', we destroy
        # its contents.  However, when re-opening table space 'two' with the
        # default flag='c', its contents remain.
        next_db_1 = SqliteDict(filename=fname, tablename='one', flag='w')
        self.assertNotIn('key', next_db_1.keys())

        next_db_2 = SqliteDict(filename=fname, tablename='two')
        self.assertIn('key', next_db_2.keys())
Example #51
    def test_readonly(self):
        fname = norm_file('tests/db/sqlitedict-override-test.sqlite')
        orig_db = SqliteDict(filename=fname)
        orig_db['key'] = 'value'
        orig_db['key_two'] = 2
        orig_db.commit()
        orig_db.close()

        readonly_db = SqliteDict(filename=fname, flag = 'r')
        self.assertTrue(readonly_db['key'] == 'value')
        self.assertTrue(readonly_db['key_two'] == 2)

        def attempt_write():
            readonly_db['key'] = ['new_value']

        def attempt_update():
            readonly_db.update(key = 'value2', key_two = 2.1)

        def attempt_delete():
            del readonly_db['key']

        def attempt_clear():
            readonly_db.clear()

        def attempt_terminate():
            readonly_db.terminate()

        attempt_funcs = [attempt_write, 
                         attempt_update, 
                         attempt_delete,
                         attempt_clear,
                         attempt_terminate]

        for func in attempt_funcs:
            with self.assertRaises(RuntimeError):
                func()
Example #52
 def assertIterationDataRecorded(self, expected, tolerance):
     db = SqliteDict( self.filename, self.tablename )
     _assertIterationDataRecorded(self, db, expected, tolerance)
     db.close()
Example #53
class SimIndex(gensim.utils.SaveLoad):
    """
    An index of documents. Used internally by SimServer.

    It uses the Similarity class to persist all document vectors to disk (via mmap).
    """
    def __init__(self, fname, num_features, shardsize=SHARD_SIZE, topsims=TOP_SIMS):
        """
        Spill index shards to disk after every `shardsize` documents.
        In similarity queries, return only the `topsims` most similar documents.
        """
        self.fname = fname
        self.shardsize = int(shardsize)
        self.topsims = int(topsims)
        self.id2pos = {} # map document id (string) to index position (integer)
        self.pos2id = {} # reverse mapping for id2pos; redundant, for performance
        self.id2sims = SqliteDict(self.fname + '.id2sims', journal_mode=JOURNAL_MODE) # precomputed top similar: document id -> [(doc_id, similarity)]
        self.qindex = gensim.similarities.Similarity(self.fname + '.idx', corpus=None,
            num_best=None, num_features=num_features, shardsize=shardsize)
        self.length = 0

    def save(self, fname):
        tmp, self.id2sims = self.id2sims, None
        super(SimIndex, self).save(fname)
        self.id2sims = tmp


    @staticmethod
    def load(fname):
        result = gensim.utils.SaveLoad.load(fname)
        result.fname = fname
        result.check_moved()
        result.id2sims = SqliteDict(fname + '.id2sims', journal_mode=JOURNAL_MODE)
        return result


    def check_moved(self):
        output_prefix = self.fname + '.idx'
        if self.qindex.output_prefix != output_prefix:
            logger.info("index seems to have moved from %s to %s; updating locations" %
                (self.qindex.output_prefix, output_prefix))
            self.qindex.output_prefix = output_prefix
            self.qindex.check_moved()


    def close(self):
        "Explicitly release important resources (file handles, db, ...)"
        try:
            self.id2sims.close()
        except:
            pass
        try:
            del self.qindex
        except:
            pass


    def terminate(self):
        """Delete all files created by this index, invalidating `self`. Use with care."""
        try:
            self.id2sims.terminate()
        except:
            pass
        import glob
        for fname in glob.glob(self.fname + '*'):
            try:
                os.remove(fname)
                logger.info("deleted %s" % fname)
            except Exception, e:
                logger.warning("failed to delete %s: %s" % (fname, e))
        for val in self.__dict__.keys():
            try:
                delattr(self, val)
            except:
                pass
Example #54
class SimServer(object):
    """
    Top-level functionality for similarity services. A similarity server takes
    care of::

    1. creating semantic models
    2. indexing documents using these models
    3. finding the most similar documents in an index.

    An object of this class can be shared across network via Pyro, to answer remote
    client requests. It is thread safe. Using a server concurrently from multiple
    processes is safe for reading = answering similarity queries. Modifying
    (training/indexing) is realized via locking = serialized internally.
    """
    def __init__(self, basename, use_locks=False):
        """
        All data will be stored under directory `basename`. If there is a server
        there already, it will be loaded (resumed).

        The server object is stateless in RAM -- its state is defined entirely by its location.
        There is therefore no need to store the server object.
        """
        if not os.path.isdir(basename):
            raise ValueError("%r must be a writable directory" % basename)
        self.basename = basename
        self.use_locks = use_locks
        self.lock_update = threading.RLock() if use_locks else gensim.utils.nocm
        try:
            self.fresh_index = SimIndex.load(self.location('index_fresh'))
        except:
            logger.debug("starting a new fresh index")
            self.fresh_index = None
        try:
            self.opt_index = SimIndex.load(self.location('index_opt'))
        except:
            logger.debug("starting a new optimized index")
            self.opt_index = None
        try:
            self.model = SimModel.load(self.location('model'))
        except:
            self.model = None
        self.payload = SqliteDict(self.location('payload'), autocommit=True, journal_mode=JOURNAL_MODE)
        self.flush(save_index=False, save_model=False, clear_buffer=True)
        logger.info("loaded %s" % self)


    def location(self, name):
        return os.path.join(self.basename, name)


    @gensim.utils.synchronous('lock_update')
    def flush(self, save_index=False, save_model=False, clear_buffer=False):
        """Commit all changes, clear all caches."""
        if save_index:
            if self.fresh_index is not None:
                self.fresh_index.save(self.location('index_fresh'))
            if self.opt_index is not None:
                self.opt_index.save(self.location('index_opt'))
        if save_model:
            if self.model is not None:
                self.model.save(self.location('model'))
        self.payload.commit()
        if clear_buffer:
            if hasattr(self, 'fresh_docs'):
                try:
                    self.fresh_docs.terminate() # erase all buffered documents + file on disk
                except:
                    pass
            self.fresh_docs = SqliteDict(journal_mode=JOURNAL_MODE) # buffer defaults to a random location in temp
        self.fresh_docs.sync()


    def close(self):
        """Explicitly close open file handles, databases etc."""
        try:
            self.payload.close()
        except:
            pass
        try:
            self.model.close()
        except:
            pass
        try:
            self.fresh_index.close()
        except:
            pass
        try:
            self.opt_index.close()
        except:
            pass
        try:
            self.fresh_docs.terminate()
        except:
            pass

    def __del__(self):
        """When the server went out of scope, make an effort to close its DBs."""
        self.close()

    @gensim.utils.synchronous('lock_update')
    def buffer(self, documents):
        """
        Add a sequence of documents to be processed (indexed or trained on).

        Here, the documents are simply collected; real processing is done later,
        during the `self.index` or `self.train` calls.

        `buffer` can be called repeatedly; the result is the same as if it was
        called once, with a concatenation of all the partial document batches.
        The point is to save memory when sending large corpora over network: the
        entire `documents` must be serialized into RAM. See `utils.upload_chunked()`.

        A call to `flush()` clears this documents-to-be-processed buffer (`flush`
        is also implicitly called when you call `index()` and `train()`).
        """
        logger.info("adding documents to temporary buffer of %s" % (self))
        for doc in documents:
            docid = doc['id']
#            logger.debug("buffering document %r" % docid)
            if docid in self.fresh_docs:
                logger.warning("asked to re-add id %r; rewriting old value" % docid)
            self.fresh_docs[docid] = doc
        self.fresh_docs.sync()


    @gensim.utils.synchronous('lock_update')
    def train(self, corpus=None, method='auto', clear_buffer=True, params=None):
        """
        Create an indexing model. Will overwrite the model if it already exists.
        All indexes become invalid, because documents in them use a now-obsolete
        representation.

        The model is trained on documents previously entered via `buffer`,
        or directly on `corpus`, if specified.
        """
        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)
        if not self.fresh_docs:
            msg = "train called but no training corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)
        if method == 'auto':
            numdocs = len(self.fresh_docs)
            if numdocs < 1000:
                logging.warning("too few training documents; using simple log-entropy model instead of latent semantic indexing")
                method = 'logentropy'
            else:
                method = 'lsi'
        if params is None:
            params = {}
        self.model = SimModel(self.fresh_docs, method=method, params=params)
        self.flush(save_model=True, clear_buffer=clear_buffer)


    @gensim.utils.synchronous('lock_update')
    def index(self, corpus=None, clear_buffer=True):
        """
        Permanently index all documents previously added via `buffer`, or
        directly index documents from `corpus`, if specified.

        The indexing model must already exist (see `train`) before this function
        is called.
        """
        if not self.model:
            msg = 'must initialize model for %s before indexing documents' % self.basename
            logger.error(msg)
            raise AttributeError(msg)

        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)

        if not self.fresh_docs:
            msg = "index called but no indexing corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)

        if not self.fresh_index:
            logger.info("starting a new fresh index for %s" % self)
            self.fresh_index = SimIndex(self.location('index_fresh'), self.model.num_features)
        self.fresh_index.index_documents(self.fresh_docs, self.model)
        if self.opt_index is not None:
            self.opt_index.delete(self.fresh_docs.keys())
        logger.info("storing document payloads")
        for docid in self.fresh_docs:
            payload = self.fresh_docs[docid].get('payload', None)
            if payload is None:
                # HACK: exit on first doc without a payload (=assume all docs have payload, or none does)
                break
            self.payload[docid] = payload
        self.flush(save_index=True, clear_buffer=clear_buffer)


    @gensim.utils.synchronous('lock_update')
    def optimize(self):
        """
        Precompute top similarities for all indexed documents. This speeds up
        `find_similar` queries by id (but not queries by fulltext).

        Internally, documents are moved from a fresh index (=no precomputed similarities)
        to an optimized index (precomputed similarities). Similarity queries always
        query both indexes, so this split is transparent to clients.

        If you add documents later via `index`, they go to the fresh index again.
        To precompute top similarities for these new documents too, simply call
        `optimize` again.

        """
        if self.fresh_index is None:
            logger.warning("optimize called but there are no new documents")
            return # nothing to do!

        if self.opt_index is None:
            logger.info("starting a new optimized index for %s" % self)
            self.opt_index = SimIndex(self.location('index_opt'), self.model.num_features)

        self.opt_index.merge(self.fresh_index)
        self.fresh_index.terminate() # delete old files
        self.fresh_index = None
        self.flush(save_index=True)


    @gensim.utils.synchronous('lock_update')
    def drop_index(self, keep_model=True):
        """Drop all indexed documents. If `keep_model` is False, also dropped the model."""
        modelstr = "" if keep_model else "and model "
        logger.info("deleting similarity index " + modelstr + "from %s" % self.basename)

        # delete indexes
        for index in [self.fresh_index, self.opt_index]:
            if index is not None:
                index.terminate()
        self.fresh_index, self.opt_index = None, None

        # delete payload
        if self.payload is not None:
            self.payload.close()

            fname = self.location('payload')
            try:
                if os.path.exists(fname):
                    os.remove(fname)
                    logger.info("deleted %s" % fname)
            except Exception, e:
                logger.warning("failed to delete %s" % fname)
        self.payload = SqliteDict(self.location('payload'), autocommit=True, journal_mode=JOURNAL_MODE)

        # optionally, delete the model as well
        if not keep_model and self.model is not None:
            self.model.close()
            fname = self.location('model')
            try:
                if os.path.exists(fname):
                    os.remove(fname)
                    logger.info("deleted %s" % fname)
            except Exception, e:
                logger.warning("failed to delete %s" % fname)
            self.model = None
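A brief driver sketch for the SimServer class above, limited to the methods shown in the snippet; the 'tokens' field is an assumption about the corpus format (the code shown only touches 'id' and 'payload'):

import os

# hypothetical usage of the SimServer defined above; basename must be an existing directory
workdir = '/tmp/simserver_data'
os.makedirs(workdir, exist_ok=True)
server = SimServer(workdir, use_locks=True)
corpus = [
    {'id': 'doc1', 'tokens': ['human', 'computer', 'interaction'], 'payload': None},
    {'id': 'doc2', 'tokens': ['graph', 'minors', 'survey'], 'payload': None},
]
server.train(corpus, method='logentropy')  # builds the semantic model from these documents
server.index(corpus)                       # indexes the same documents with that model
server.close()                             # releases the payload DB and other handles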
Example #55
class SqliteRecorder(BaseRecorder):
    """ Recorder that saves cases in an SQLite dictionary.

    Args
    ----
    sqlite_dict_args : dict
        Dictionary of any additional arguments for the SQL db.

    Options
    -------
    options['record_metadata'] :  bool(True)
        Tells recorder whether to record variable attribute metadata.
    options['record_unknowns'] :  bool(True)
        Tells recorder whether to record the unknowns vector.
    options['record_params'] :  bool(False)
        Tells recorder whether to record the params vector.
    options['record_resids'] :  bool(False)
        Tells recorder whether to record the residuals vector.
    options['record_derivs'] :  bool(True)
        Tells recorder whether to record derivatives that are requested by a `Driver`.
    options['includes'] :  list of strings
        Patterns for variables to include in recording.
    options['excludes'] :  list of strings
        Patterns for variables to exclude in recording (processed after includes).
    """

    def __init__(self, out, **sqlite_dict_args):
        super(SqliteRecorder, self).__init__()

        if MPI and MPI.COMM_WORLD.rank > 0 :
            self._open_close_sqlitedict = False
        else:
            self._open_close_sqlitedict = True

        if self._open_close_sqlitedict:
            sqlite_dict_args.setdefault('autocommit', True)
            self.out = SqliteDict(filename=out, flag='n', tablename='openmdao', **sqlite_dict_args)
            self.out_derivs = SqliteDict(filename=out, flag='w', tablename='openmdao_derivs', **sqlite_dict_args)

        else:
            self.out = None

    def record_metadata(self, group):
        """Stores the metadata of the given group in a sqlite file using
        the variable name for the key.

        Args
        ----
        group : `System`
            `System` containing vectors
        """

        params = group.params.iteritems()
        #resids = group.resids.iteritems()
        unknowns = group.unknowns.iteritems()

        data = OrderedDict([
                            ('format_version', format_version),
                            ('Parameters', dict(params)),
                            ('Unknowns', dict(unknowns)),
                            ])

        self.out['metadata'] = data

    def record_iteration(self, params, unknowns, resids, metadata):
        """
        Stores the provided data in the sqlite file using the iteration
        coordinate for the key.

        Args
        ----
        params : dict
            Dictionary containing parameters. (p)

        unknowns : dict
            Dictionary containing outputs and states. (u)

        resids : dict
            Dictionary containing residuals. (r)

        metadata : dict, optional
            Dictionary containing execution metadata (e.g. iteration coordinate).
        """

        data = OrderedDict()
        iteration_coordinate = metadata['coord']
        timestamp = metadata['timestamp']

        group_name = format_iteration_coordinate(iteration_coordinate)

        data['timestamp'] = timestamp
        data['success'] = metadata['success']
        data['msg'] = metadata['msg']

        if self.options['record_params']:
            data['Parameters'] = self._filter_vector(params, 'p', iteration_coordinate)

        if self.options['record_unknowns']:
            data['Unknowns'] = self._filter_vector(unknowns, 'u', iteration_coordinate)

        if self.options['record_resids']:
            data['Residuals'] = self._filter_vector(resids, 'r', iteration_coordinate)

        self.out[group_name] = data

    def record_derivatives(self, derivs, metadata):
        """Writes the derivatives that were calculated for the driver.

        Args
        ----
        derivs : dict or ndarray depending on the optimizer
            Dictionary containing derivatives

        metadata : dict, optional
            Dictionary containing execution metadata (e.g. iteration coordinate).
        """

        data = OrderedDict()
        iteration_coordinate = metadata['coord']
        timestamp = metadata['timestamp']

        group_name = format_iteration_coordinate(iteration_coordinate)

        data['timestamp'] = timestamp
        data['success'] = metadata['success']
        data['msg'] = metadata['msg']
        data['Derivatives'] = derivs

        self.out_derivs[group_name] = data

    def close(self):
        """Closes `out`"""

        if self._open_close_sqlitedict:
            if self.out is not None:
                self.out.close()
                self.out = None
            if self.out_derivs is not None:
                self.out_derivs.close()
                self.out_derivs = None
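Because this recorder writes cases and derivatives into two tables of the same SQLite file, the results can be read back with plain SqliteDict handles. A minimal read-back sketch follows; the 'cases.sqlite' filename and the printed fields are illustrative assumptions, not part of the class above.

# Read-back sketch for the two-table recorder above (filename is an assumption).
from sqlitedict import SqliteDict

cases = SqliteDict(filename='cases.sqlite', tablename='openmdao', flag='r')
derivs = SqliteDict(filename='cases.sqlite', tablename='openmdao_derivs', flag='r')

for coord, data in cases.items():
    if coord == 'metadata':
        continue  # variable metadata is stored under this fixed key
    print(coord, data['timestamp'], data.get('Unknowns'))

for coord, data in derivs.items():
    print(coord, data['Derivatives'])

cases.close()
derivs.close()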
Beispiel #56
0
class SqliteRecorder(BaseRecorder):
    """ Recorder that saves cases in an SQLite dictionary.

    Args
    ----
    sqlite_dict_args : dict
        Dictionary of any additional arguments for the SQL db.

    Options
    -------
    options['record_metadata'] :  bool(True)
        Tells recorder whether to record variable attribute metadata.
    options['record_unknowns'] :  bool(True)
        Tells recorder whether to record the unknowns vector.
    options['record_params'] :  bool(False)
        Tells recorder whether to record the params vector.
    options['record_resids'] :  bool(False)
        Tells recorder whether to record the residuals vector.
    options['record_derivs'] :  bool(True)
        Tells recorder whether to record derivatives that are requested by a `Driver`.
    options['includes'] :  list of strings
        Patterns for variables to include in recording.
    options['excludes'] :  list of strings
        Patterns for variables to exclude in recording (processed after includes).
    """

    def __init__(self, out, **sqlite_dict_args):
        super(SqliteRecorder, self).__init__()

        if MPI and MPI.COMM_WORLD.rank > 0:
            self._open_close_sqlitedict = False
        else:
            self._open_close_sqlitedict = True

        if self._open_close_sqlitedict:
            sqlite_dict_args.setdefault("autocommit", True)
            sqlite_dict_args.setdefault("tablename", "openmdao")
            self.out = SqliteDict(filename=out, flag="n", **sqlite_dict_args)
        else:
            self.out = None

    def record_metadata(self, group):
        """Stores the metadata of the given group in a sqlite file using
        the variable name for the key.

        Args
        ----
        group : `System`
            `System` containing vectors
        """

        params = group.params.iteritems()
        resids = group.resids.iteritems()
        unknowns = group.unknowns.iteritems()

        data = OrderedDict([("Parameters", dict(params)), ("Unknowns", dict(unknowns))])

        self.out["metadata"] = data

    def record_iteration(self, params, unknowns, resids, metadata):
        """
        Stores the provided data in the sqlite file using the iteration
        coordinate for the key.

        Args
        ----
        params : dict
            Dictionary containing parameters. (p)

        unknowns : dict
            Dictionary containing outputs and states. (u)

        resids : dict
            Dictionary containing residuals. (r)

        metadata : dict, optional
            Dictionary containing execution metadata (e.g. iteration coordinate).
        """

        data = OrderedDict()
        iteration_coordinate = metadata["coord"]
        timestamp = metadata["timestamp"]

        group_name = format_iteration_coordinate(iteration_coordinate)

        data["timestamp"] = timestamp
        data["success"] = metadata["success"]
        data["msg"] = metadata["msg"]

        if self.options["record_params"]:
            data["Parameters"] = self._filter_vector(params, "p", iteration_coordinate)

        if self.options["record_unknowns"]:
            data["Unknowns"] = self._filter_vector(unknowns, "u", iteration_coordinate)

        if self.options["record_resids"]:
            data["Residuals"] = self._filter_vector(resids, "r", iteration_coordinate)

        self.out[group_name] = data

    def record_derivatives(self, derivs, metadata):
        """Writes the derivatives that were calculated for the driver.

        Args
        ----
        derivs : dict
            Dictionary containing derivatives

        metadata : dict, optional
            Dictionary containing execution metadata (e.g. iteration coordinate).
        """

        data = OrderedDict()
        iteration_coordinate = metadata["coord"]
        timestamp = metadata["timestamp"]

        group_name = format_iteration_coordinate(iteration_coordinate)
        group_name = "%s/derivs" % group_name

        data["timestamp"] = timestamp
        data["success"] = metadata["success"]
        data["msg"] = metadata["msg"]
        data["Derivatives"] = derivs

        self.out[group_name] = data

    def close(self):
        """Closes `out`"""

        if self._open_close_sqlitedict:
            if self.out is not None:
                self.out.close()
                self.out = None
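The Options section in the docstring above drives what record_iteration actually stores. A hedged usage sketch, assuming the BaseRecorder options dictionary accepts item assignment the same way the class reads it; the journal_mode keyword and the glob patterns are illustrative, not requirements.

# Construct the recorder and toggle the documented options (values are examples).
recorder = SqliteRecorder('cases.sqlite', journal_mode='WAL')  # extra kwargs pass through to SqliteDict
recorder.options['record_params'] = True
recorder.options['record_resids'] = False
recorder.options['includes'] = ['comp1.*']   # record only matching variables
recorder.options['excludes'] = ['*.raw']     # then drop these from the matches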
Beispiel #57
0
class IMAPMailbox(ExtendedMaildir):
    implements(imap4.IMailbox, imap4.ICloseableMailbox)
    
    AppendFactory = SerpentAppendMessageTask

    def __init__(self, path):
        maildir.initializeMaildir(path)
        self.listeners = []
        self.path = path
        self.open_flags()
        self.lastadded = None
        self.__check_flags_()
    
    def open_flags(self):
        self.msg_info = SqliteDict(os.path.join(self.path, conf.imap_msg_info))
        self.mbox_info = SqliteDict(os.path.join(self.path, conf.imap_mbox_info))

    def _start_monitor(self):
        self.notifier = inotify.INotify()
        self.notifier.startReading()
        self.notifier.watch(filepath.FilePath(os.path.join(self.path, 'new')),
                   callbacks=[self._new_files])
        self.notifier.watch(filepath.FilePath(os.path.join(self.path,'cur')),
                   callbacks=[self._new_files])

    def _stop_monitor(self):
        self.notifier.stopReading()
        self.notifier.loseConnection()

    def _new_files(self, wo, path, code):
        if code == inotify.IN_MOVED_TO or code == inotify.IN_DELETE:
            for l in self.listeners:
                l.newMessages(self.getMessageCount(), self.getRecentCount())

    def __check_flags_(self):
        if 'subscribed' not in self.mbox_info.keys(): self.mbox_info['subscribed'] = False
        if 'flags' not in self.mbox_info.keys(): self.mbox_info['flags'] = []
        if 'special' not in self.mbox_info.keys(): self.mbox_info['special'] = ''
        if 'uidvalidity' not in self.mbox_info.keys(): self.mbox_info['uidvalidity'] = random.randint(0, 2**32)
        if 'uidnext' not in self.mbox_info.keys(): self.mbox_info['uidnext'] = 1
        #self.mbox_info.commit(blocking=False)    # XXX
        l = [l for l in self.__msg_list_()]
        for i in l:
            fn = i.split('/')[-1]
            if fn not in self.msg_info.keys():
                val1 = {'uid': self.getUIDNext()}
                if i.split('/')[-2] == 'new':
                    val1['flags'] = []
                else:
                    val1['flags'] = [misc.IMAP_FLAGS['SEEN']]
                self.msg_info[fn] = val1
        #self.msg_info.commit(blocking=False)    # XXX

    def subscribe(self):
        self.mbox_info['subscribed'] = True
        #self.mbox_info.commit(blocking=False)    # XXX

    def unsubscribe(self):
        self.mbox_info['subscribed'] = False
        #self.mbox_info.commit(blocking=False)    # XXX
    
    def is_subscribed(self):
        return self.mbox_info['subscribed']

    def __count_flagged_msgs_(self, flag):
        val1 = [0 for fn in self.msg_info.keys() if flag in self.msg_info[fn]['flags']]
        return len(val1)
    
    def getHierarchicalDelimiter(self):
        return misc.IMAP_HDELIM

    def setSpecial(self, special):
        self.mbox_info['special'] = special
        #self.mbox_info.commit(blocking=False)    # XXX

    def getFlags(self):
        return sorted(misc.IMAP_FLAGS.values())
    
    def getMboxFlags(self):
        f = list(self.mbox_info['flags'])
        if self.mbox_info['special'] != '': f.append(self.mbox_info['special'])
        return f
    
    def addFlag(self, flag):
        self.mbox_info['flags'] = list(set(self.mbox_info['flags']).union([flag]))
        #self.mbox_info.commit(blocking=False)    # XXX
    
    def removeFlag(self, flag):
        self.mbox_info['flags'] = list(set(self.mbox_info['flags']).difference([flag]))
        #self.mbox_info.commit(blocking=False)    # XXX
    
    def hasChildren(self):
        flags = self.getFlags()
        if misc.MBOX_FLAGS['HASCHILDREN'] not in flags:
            self.addFlag(misc.MBOX_FLAGS['HASCHILDREN'])
        if misc.MBOX_FLAGS['HASNOCHILDREN'] in flags:
            self.removeFlag(misc.MBOX_FLAGS['HASNOCHILDREN'])
    def hasNoChildren(self):
        flags = self.getFlags()
        if misc.MBOX_FLAGS['HASNOCHILDREN'] not in flags:
            self.addFlag(misc.MBOX_FLAGS['HASNOCHILDREN'])
        if misc.MBOX_FLAGS['HASCHILDREN'] in flags:
            self.removeFlag(misc.MBOX_FLAGS['HASCHILDREN'])

    def getMessageCount(self):
        val1 = [0 for fn in self.msg_info.keys() if misc.IMAP_FLAGS['DELETED'] not in self.msg_info[fn]['flags']]
        return len(val1)

    def getRecentCount(self):
        c = 0
        for fn in self.msg_info.keys():
            if misc.IMAP_FLAGS['RECENT'] in self.msg_info[fn]['flags']:
                c += 1
                info = self.msg_info[fn]
                info['flags'] = set(info['flags']).difference(set([misc.IMAP_FLAGS['RECENT']]))
                self.msg_info[fn] = info
        #self.msg_info.commit(blocking=False)    # XXX
        return c
    
    def getUnseenCount(self):
        return self.getMessageCount() - self.__count_flagged_msgs_(misc.IMAP_FLAGS['SEEN'])

    def isWriteable(self):
        return True

    def getUIDValidity(self):
        return self.mbox_info['uidvalidity']
    
    def getUIDNext(self):
        un = self.mbox_info['uidnext']
        self.mbox_info['uidnext'] += 1
        #self.mbox_info.commit(blocking=False)    # XXX
        return un
    
    def getUID(self, num):
        return num

    def addMessage(self, message, flags = (), date = None):
        return self.appendMessage(message).addCallback(self._cbAddMessage, flags)
    
    def _cbAddMessage(self, obj, flags):
        path = self.lastadded
        self.lastadded = None
        fn = path.split('/')[-1]
        self.msg_info[fn] = {'uid': self.getUIDNext(), 'flags': flags}
        #self.msg_info.commit(blocking=False)    # XXX
        if misc.IMAP_FLAGS['SEEN'] in flags and path.split('/')[-2] != 'cur':
            new_path = os.path.join(self.path, 'cur', fn)
            os.rename(path, new_path)

    def __msg_list_(self):
        a = []
        for m in os.listdir(os.path.join(self.path, 'new')):
            a.append(os.path.join(self.path, 'new', m))
        for m in os.listdir(os.path.join(self.path, 'cur')):
            a.append(os.path.join(self.path, 'cur', m))
        return a

    def _seqMessageSetToSeqDict(self, messageSet):
        if not messageSet.last:
            messageSet.last = self.getMessageCount()

        seqMap = {}
        msgs = self.__msg_list_()
        for messageNum in messageSet:
            if messageNum > 0 and messageNum <= self.getMessageCount():
                seqMap[messageNum] = msgs[messageNum - 1]
        return seqMap

    def fetch(self, messages, uid):
        return [[seq, MaildirMessage(seq,
                                     file(filename, 'rb').read(),
                                     self.msg_info[filename.split('/')[-1]]['flags'],
                                     rfc822date())]
                for seq, filename in self.__fetch_(messages, uid).iteritems()]
    def __fetch_(self, messages, uid):
        if uid:
            messagesToFetch = {}
            if not messages.last:
                messages.last = self.mbox_info['uidnext']
            fn_uid = dict((fn, self.msg_info[fn]['uid']) for fn in self.msg_info.keys())
            for uid in messages:
                if uid in fn_uid.values():
                    for name, _id in fn_uid.iteritems():
                        if uid == _id:
                            if os.path.exists(os.path.join(self.path,'new', name)):
                                messagesToFetch[uid] = os.path.join(self.path,'new', name)
                            elif os.path.exists(os.path.join(self.path,'cur', name)):
                                messagesToFetch[uid] = os.path.join(self.path,'cur', name)
        else:
            messagesToFetch = self._seqMessageSetToSeqDict(messages)
        return messagesToFetch
    def store(self, messages, flags, mode, uid):
        d = {}
        for _id, path in self.__fetch_(messages, uid).iteritems():
            filename = path.split('/')[-1]
            if mode < 0:
                old_f = self.msg_info[filename]
                old_f['flags'] = list(set(old_f['flags']).difference(set(flags)))
                self.msg_info[filename] = old_f
                if misc.IMAP_FLAGS['SEEN'] in flags and path.split('/')[-2] != 'new':
                    new_path = os.path.join(self.path, 'new', filename)
                    os.rename(path, new_path)
            elif mode == 0:
                old_f = self.msg_info[filename]
                old_f['flags'] = flags
                self.msg_info[filename] = old_f
            elif mode > 0:
                old_f = self.msg_info[filename]
                old_f['flags'] = list(set(old_f['flags']).union(set(flags)))
                self.msg_info[filename] = old_f
                if misc.IMAP_FLAGS['SEEN'] in flags and path.split('/')[-2] != 'cur':
                    new_path = os.path.join(self.path, 'cur', filename)
                    os.rename(path, new_path)
            d[_id] = self.msg_info[filename]['flags']
        #self.msg_info.commit(blocking=False)    # XXX
        return d
    
    def expunge(self):
        uids = []
        for path in self.__msg_list_():
            fn = path.split('/')[-1]
            if fn not in self.msg_info.keys():
                continue
            uid = self.msg_info[fn]['uid']
            if misc.IMAP_FLAGS['DELETED'] in self.msg_info[fn]['flags']:
                os.remove(path)
                del self.msg_info[fn]
                uids.append(uid)
        #self.msg_info.commit(blocking=False)    # XXX
        return uids
    
    def addListener(self, listener):
        self.listeners.append(listener)
        return True

    def removeListener(self, listener):
        self.listeners.remove(listener)
        return True
    
    def requestStatus(self, names):
        return imap4.statusRequestHelper(self, names)
    
    def destroy(self):
        pass

    def close(self):
        print('!!! %s - %d !!!' % (self.path, len(self.listeners)))
        if len(self.listeners) == 0:
            self._stop_monitor() 
            if conf.imap_expunge_on_close:
                self.expunge()
            self.msg_info.commit(blocking=False)
            self.mbox_info.commit(blocking=False)
            self.msg_info.close()
            self.mbox_info.close()
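Note the read-modify-write pattern in getRecentCount and store: values coming out of a SqliteDict are unpickled copies, so mutating them in place is lost unless the record is assigned back. A minimal standalone sketch of that pattern, with a hypothetical 'flags.sqlite' file:

from sqlitedict import SqliteDict

info = SqliteDict('flags.sqlite', autocommit=True)   # hypothetical path
info['msg1'] = {'uid': 1, 'flags': []}

# Mutating info['msg1']['flags'] directly would not persist; copy, change, re-assign.
rec = info['msg1']
rec['flags'] = list(set(rec['flags']).union(['\\Seen']))
info['msg1'] = rec

info.close()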
Beispiel #58
0
class SqliteRecorder(BaseRecorder):

    def __init__(self, out, **sqlite_dict_args):
        super(SqliteRecorder, self).__init__()

        if MPI and MPI.COMM_WORLD.rank > 0:
            self._open_close_sqlitedict = False
        else:
            self._open_close_sqlitedict = True

        if self._open_close_sqlitedict:
            sqlite_dict_args.setdefault('autocommit', True)
            sqlite_dict_args.setdefault('tablename', 'openmdao')
            self.out = SqliteDict(filename=out, flag='n', **sqlite_dict_args)
        else:
            self.out = None

    def record_metadata(self, group):
        """Stores the metadata of the given group in a sqlite file using
        the variable name for the key.

        Args
        ----
        group : `System`
            `System` containing vectors 
        """

        params = group.params.iteritems()
        resids = group.resids.iteritems()
        unknowns = group.unknowns.iteritems()

        data = OrderedDict([('Parameters', dict(params)),
                            ('Unknowns', dict(unknowns)),
                            ])

        self.out['metadata'] = data

    def record_iteration(self, params, unknowns, resids, metadata):
        """
        Stores the provided data in the sqlite file using the iteration
        coordinate for the key.

        Args
        ----
        params : dict
            Dictionary containing parameters. (p)

        unknowns : dict
            Dictionary containing outputs and states. (u)

        resids : dict
            Dictionary containing residuals. (r)

        metadata : dict, optional
            Dictionary containing execution metadata (e.g. iteration coordinate).
        """

        data = OrderedDict()
        iteration_coordinate = metadata['coord']
        timestamp = metadata['timestamp']
        params, unknowns, resids = self._filter_vectors(params, unknowns, resids, iteration_coordinate)

        group_name = format_iteration_coordinate(iteration_coordinate)
        
        data['timestamp'] = timestamp

        if self.options['record_params']:
            data['Parameters'] = params

        if self.options['record_unknowns']:
            data['Unknowns'] = unknowns

        if self.options['record_resids']:
            data['Residuals'] = resids

        self.out[group_name] = data

    def close(self):
        """Closes `out`"""

        if self._open_close_sqlitedict:
            if self.out is not None:
                self.out.close()
                self.out = None
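The MPI guard in the constructor means only rank 0 ever opens the SQLite file; all other ranks keep self.out as None and skip the close. A standalone sketch of the same single-writer pattern, assuming mpi4py is available and using a made-up 'cases.sqlite' name:

from mpi4py import MPI
from sqlitedict import SqliteDict

rank = MPI.COMM_WORLD.rank
db = SqliteDict('cases.sqlite', tablename='openmdao', flag='n', autocommit=True) if rank == 0 else None

if db is not None:                     # only rank 0 writes
    db['rank0_only'] = {'note': 'single-writer pattern'}
    db.close()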
Beispiel #59
0
def reset(texts, index_dic=True, tfidf=True, hdp=False, lda=True, sim=False):
    total_start = timeit.default_timer()
    make_index_time = 0
    make_dict_time = 0
    make_lda_time = 0
    make_tfidf_time = 0
    sim_time = 0
    hdptopicnum = 0

    if index_dic:
        f = [i.split(',') for i in texts.readlines()]
        logging.info('Create id & ac_id list')
        ids = [f[i][1] for i in range(len(f))]
        ac_ids = [f[i][0] for i in range(len(f))]
        logging.info('Create contents list')
        contents = []
        for i in range(len(f)):
            if len(f[i]) == 3:
                contents.append(f[i][2].strip().split(':'))
            else:
                contents.append([])

        # make index
        logging.info('***********Now Make Index by sqlitedict***********')
        timer_start = timeit.default_timer()
        pos2paid = zip(range(len(f)), ac_ids)
        paid2pos_rel = {}
        for key, paid in groupby(sorted(pos2paid, key=itemgetter(1)), key=itemgetter(1)):
            paid2pos_rel.update({int(key): [i[0] for i in paid]})
        id2pos_rel = dict(zip(ids, range(len(f))))
        pos2id_rel = dict(zip(range(len(f)), ids))

        id2pos = SqliteDict(filename=gl.res + '/resource/id2pos', autocommit=True)
        id2pos.clear()
        id2pos.update(id2pos_rel)
        id2pos.close()
        pos2id = SqliteDict(filename=gl.res + '/resource/pos2id', autocommit=True)
        pos2id.clear()
        pos2id.update(pos2id_rel)
        pos2id.close()
        paid2pos = SqliteDict(filename=gl.res + '/resource/paid2pos', autocommit=True)
        paid2pos.clear()
        paid2pos.update(paid2pos_rel)
        paid2pos.close()
        timer_end = timeit.default_timer()
        make_index_time = timer_end - timer_start

        # make dict
        logging.info('***********Now Make Dictionary***********')
        timer_start = timeit.default_timer()
        dic = corpora.Dictionary(contents)
        ############## optimized dictionary
        dic.filter_extremes(no_below=20, no_above=0.1, keep_n=None)
        ##############
        dic.save(gl.res + '/resource/dict')
        timer_end = timeit.default_timer()
        make_dict_time = timer_end - timer_start

        # make corpus
        logging.info('***********Now Make Corpus***********')

        temps = []
        for i, t in enumerate(contents):
            temps.append(dic.doc2bow(t))
            if i % 10000 == 0:
                logging.info('make corpus ' + str(i) + ' articles')
        corpus = temps
        corpora.MmCorpus.serialize(gl.res + '/resource/corpus', corpus)

    if tfidf:
        # do tfidf train
        logging.info('***********Now Training TF-IDF Model***********')
        timer_start = timeit.default_timer()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        tfidf = models.TfidfModel(corpus)
        tfidf.save(gl.res + '/resource/tfidf')

        timer_end = timeit.default_timer()
        make_tfidf_time = timer_end - timer_start

    if hdp:
        gc.collect()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        hdpmodel = models.hdpmodel.HdpModel(corpus, id2word=dic)
        hdptopicnum = len(hdpmodel.print_topics(topics=-1, topn=10))
        logging.info('hdptopicnum is {}'.format(hdptopicnum))

    if lda:
        # do lda train
        gc.collect()
        tfidf = models.TfidfModel.load(gl.res + '/resource/tfidf')
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        corpus_tfidf = tfidf[corpus]
        logging.info('***********Now Training LDA Model***********')
        timer_start = timeit.default_timer()
        if not hdptopicnum == 0:
            gl.topicCount = hdptopicnum
        lda = models.LdaMulticore(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
                                  num_topics=gl.topicCount, workers=gl.workers, passes=gl.lda_passes)
        # lda = models.LdaModel(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
        #                       num_topics=gl.topicCount, passes=gl.lda_passes, distributed=True)
        lda.save(gl.res + '/resource/lda')
        timer_end = timeit.default_timer()
        make_lda_time = timer_end - timer_start
        logging.info('lda training cost %.2f seconds' % make_lda_time)

    if sim:
        gc.collect()
        logging.info('***********Now Make Similarity Index***********')
        st = timeit.default_timer()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        lda = models.LdaModel.load(gl.res + '/resource/lda')
        index = similarities.MatrixSimilarity(lda[corpus], num_features=gl.topicCount)
        index.save(gl.res + '/resource/simIndex')
        sim_time = timeit.default_timer() - st

    total_end = timeit.default_timer()
    total_time = total_end - total_start
    m = divmod(total_time, 60)
    h = divmod(m[0], 60)
    logging.info('\nReset LDA Model complete!!!\n'
                 '***Using time*** \n'
                 'index training    {:.2f}\n'
                 'dict training     {:.2f}\n'
                 'tfidf training    {:.2f}\n'
                 'lda training      {:.2f}\n'
                 'sim training      {:.2f}\n'
                 'Total time:       {:d}h {:d}m {:.2f}s'.format(make_index_time, make_dict_time, make_tfidf_time,
                                                                make_lda_time, sim_time, int(h[0]), int(h[1]), m[1]))

    basicConfig = open(gl.res + '/resource/basicConfig.txt', mode='w+')
    basicConfig.write('FileName: {}'
                      '\nTopicNumber = {}'
                      '\nestTopicNumber = {}'
                      '\nldaPasses = {}'
                      .format(os.path.basename(texts.name), gl.topicCount, hdptopicnum, gl.lda_passes))
    basicConfig.close()
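Once reset() has written the dictionary, TF-IDF model, LDA model and similarity index, a query only needs to replay the same transforms. A hedged sketch, reusing the gl config module and resource paths from above and mirroring the sim branch, which applies the LDA model directly to bag-of-words vectors; the sample tokens are made up.

# Query sketch against the artifacts written by reset() (paths mirror the code above).
from gensim import corpora, models, similarities

dic = corpora.Dictionary.load(gl.res + '/resource/dict')
lda = models.LdaModel.load(gl.res + '/resource/lda')
index = similarities.MatrixSimilarity.load(gl.res + '/resource/simIndex')

bow = dic.doc2bow(['sample', 'tokens'])                  # illustrative tokens
sims = index[lda[bow]]                                   # similarity against every document
print(sorted(enumerate(sims), key=lambda x: -x[1])[:10])  # top-10 (position, score) pairs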
Beispiel #60
0
def merge(texts, index_dic=True, tfidf=True, lda=True, sim=False):
    total_start = timeit.default_timer()
    make_index_time = 0
    make_dict_time = 0
    make_lda_time = 0
    make_tfidf_time = 0
    sim_time = 0

    if index_dic:
        f = [i.split(',') for i in texts.readlines()]
        logging.info('Create id & ac_id list')
        ids = [f[i][0] for i in range(len(f))]
        ac_ids = [f[i][1] for i in range(len(f))]
        logging.info('Create contents list')
        contents = []
        for i in range(len(f)):
            if len(f[i]) == 3:
                contents.append(f[i][2].strip().split(':'))
            else:
                contents.append([])

        # make index
        logging.info('***********Now merge index by sqlitedict***********')
        timer_start = timeit.default_timer()
        old_corpus_len = len(corpora.MmCorpus(gl.res + '/resource/corpus'))
        pos2paid = zip(range(old_corpus_len, old_corpus_len + len(f)), ac_ids)
        paid2pos_new = {}
        for key, paid in groupby(sorted(pos2paid, key=itemgetter(1)), key=itemgetter(1)):
            paid2pos_new.update({int(key): [i[0] for i in paid]})
        id2pos_new = dict(zip(ids, range(old_corpus_len, old_corpus_len + len(f))))
        pos2id_new = dict(zip(range(old_corpus_len, old_corpus_len + len(f)), ids))

        id2pos = SqliteDict(filename=gl.res + '/resource/id2pos', autocommit=True)
        id2pos.update(id2pos_new)
        id2pos.close()
        pos2id = SqliteDict(filename=gl.res + '/resource/pos2id', autocommit=True)
        pos2id.update(pos2id_new)
        pos2id.close()
        paid2pos = SqliteDict(filename=gl.res + '/resource/paid2pos', autocommit=True)
        x = [set(paid2pos_new.keys()), set([int(i) for i in paid2pos.iterkeys()])]
        for i in list(set.intersection(*x)):  # update duplicate key
            temp = list(chain(paid2pos[i], paid2pos_new[i]))
            paid2pos.update({int(i): temp})
        paid2pos.close()
        timer_end = timeit.default_timer()
        make_index_time = timer_end - timer_start

        # Merge dictionary
        logging.info('***********Now merge Dictionary***********')
        timer_start = timeit.default_timer()
        newDict = corpora.Dictionary(contents)
        newDict.filter_extremes(no_below=20, no_above=0.1, keep_n=None)
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        dic.merge_with(newDict)
        dic.save(gl.res + '/resource/dict')
        timer_end = timeit.default_timer()
        make_dict_time = timer_end - timer_start

        # merge corpus
        logging.info('***********Now merge Corpus***********')
        temps = []
        for i, t in enumerate(contents):
            temps.append(dic.doc2bow(t))
            if i % 10000 == 0:
                logging.info('make corpus ' + str(i) + ' articles')
        corpora.MmCorpus.serialize(gl.res + '/resource/new_c', temps)
        gc.collect()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        new_corpus = corpora.MmCorpus(gl.res + '/resource/new_c')
        merged_corpus = chain(corpus, new_corpus)
        corpora.MmCorpus.serialize(gl.res + '/resource/merged_c', merged_corpus)  # Overwrite corpus

        for filename in glob.glob(gl.res + '/resource/*'):
            if filename.endswith('corpus') or filename.endswith('corpus.index') \
                    or filename.endswith('new_c') or filename.endswith('new_c.index'):  # rm useless corpus
                # os.remove(filename)
                os.unlink(filename)
            if filename.endswith('merged_c'):  # rename to corpus
                os.rename(filename, gl.res + '/resource/corpus')
            if filename.endswith('merged_c.index'):
                os.rename(filename, gl.res + '/resource/corpus.index')

    if tfidf:
        # do tfidf merge
        gc.collect()
        logging.info('***********Now merge TF-IDF model***********')
        timer_start = timeit.default_timer()
        for filename in glob.glob(gl.res + '/resource/*'):  # backup old model
            if filename.endswith('tfidf'):
                os.rename(filename, filename + '_' + gl.c_time)
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')  # reload corpus
        tfidf = models.TfidfModel(corpus)
        tfidf.save(gl.res + '/resource/tfidf')
        timer_end = timeit.default_timer()
        make_tfidf_time = timer_end - timer_start

    if lda:
        # do lda merge
        gc.collect()
        tfidf = models.TfidfModel.load(gl.res + '/resource/tfidf')
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        corpus_tfidf = tfidf[corpus]
        dic = corpora.Dictionary.load(gl.res + '/resource/dict')
        logging.info('***********Now merge LDA model***********')
        timer_start = timeit.default_timer()
        for filename in glob.glob(gl.res + '/resource/*'):  # backup old model
            if filename.endswith('lda') or filename.endswith('lda.state'):
                os.rename(filename, filename + '_' + gl.c_time)
        # lda = models.LdaMulticore(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
        #                           num_topics=gl.topicCount, workers=gl.workers, passes=gl.lda_passes)
        lda = models.LdaModel(corpus_tfidf, id2word=dic, chunksize=gl.chunksize,
                              num_topics=gl.topicCount, passes=gl.lda_passes)
        lda.save(gl.res + '/resource/lda')
        timer_end = timeit.default_timer()
        make_lda_time = timer_end - timer_start
        logging.info('lda training cost %.2f seconds' % make_lda_time)

    if sim:
        gc.collect()
        logging.info('***********Now Make Similarity Index***********')
        st = timeit.default_timer()
        corpus = corpora.MmCorpus(gl.res + '/resource/corpus')
        lda = models.LdaModel.load(gl.res + '/resource/lda')
        index = similarities.MatrixSimilarity(lda[corpus], num_features=gl.topicCount)
        index.save(gl.res + '/resource/simIndex')
        sim_time = timeit.default_timer() - st

    total_end = timeit.default_timer()
    total_time = total_end - total_start
    m = divmod(total_time, 60)
    h = divmod(m[0], 60)
    logging.info('\nMerge LDA Model complete!!!\n'
                 '***Using time*** \n'
                 'index training    {:.2f}\n'
                 'dict training     {:.2f}\n'
                 'tfidf training    {:.2f}\n'
                 'lda training      {:.2f}\n'
                 'sim training      {:.2f}\n'
                 'Total time:       {:d}h {:d}m {:.2f}s'.format(make_index_time, make_dict_time, make_tfidf_time,
                                                                make_lda_time, sim_time, int(h[0]), int(h[1]), m[1]))
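Both reset() and merge() maintain SqliteDict indices (id2pos, pos2id, paid2pos) that map article ids to positions in the merged corpus. A minimal lookup sketch, assuming the same gl config module; the article id '12345' is a made-up example.

# Look up one article's bag-of-words vector via the id2pos index.
from sqlitedict import SqliteDict
from gensim import corpora

id2pos = SqliteDict(filename=gl.res + '/resource/id2pos', flag='r')
corpus = corpora.MmCorpus(gl.res + '/resource/corpus')

pos = id2pos['12345']     # article id -> position in the merged corpus
print(corpus[pos])        # serialized MmCorpus supports random access via its .index file
id2pos.close()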