Python SqliteDict.keysの例、sqlitedict.SqliteDict.keys Pythonの例

コード例 #1

0

ファイルを表示

ファイル: filenames_database.py プロジェクト: afrazkhan/cstash

    def search(self, obj, exact=False, db=None):
        """
        Search the database for partial matches of [obj], and return a list of matches
        in the tuple form:

        ("obj", { "filename_hash": string,
                  "cryptographer": string,
                  "key": string,
                  "storage_provider": string,
                  "bucket": string,
                  "file_hash": string } )

        If [obj] is None (and [exact] is not True), every entry is returned.

        If [exact] == True, then only exact matches will be returned. Since there should
        only ever be a single exact match for a path in the DB, a CstashCriticalException
        will be thrown if more than a single element is in the resulting list. This shouldn't
        be possible anyway, since the DB is a key/value store, but it's a safety measure.

        [db] overrides the database path; it defaults to self.db.
        """

        db = db or self.db
        db_connection = SqliteDict(db, autocommit=True, flag='r')
        try:
            if exact is True:
                keys = [(k, db_connection[k]) for k in db_connection.keys() if obj == k]
            elif obj is None:
                keys = [(k, db_connection[k]) for k in db_connection.keys()]
            else:
                keys = [(k, db_connection[k]) for k in db_connection.keys() if obj in k]
        finally:
            # Release the connection on every path, including the error raised below.
            db_connection.close()

        if exact is True and len(keys) > 1:
            # The placeholders live in the second literal, so that is the one
            # that must be an f-string for them to be interpolated.
            raise exceptions.CstashCriticalException(message=(
                "Found more than a single match "
                f"for {obj} in the database:\n\n{keys}"))

        return keys

コード例 #2

0

ファイルを表示

ファイル: efa.py プロジェクト: deti/efa

def adjust_evernote_font():
    """
    Re-apply the configured font size / line height to Evernote notes.

    Every note returned by get_notes(get_notebooks()) is adjusted when it has
    no record in the local DB or when its recorded settings differ from
    conf.font_size / conf.line_height; the record is then refreshed. Records
    whose guid was not seen in this run are deleted afterwards.
    """
    note_info = SqliteDict(conf.db.db_file, autocommit=True)
    try:
        # A set gives O(1) membership when pruning stale records below.
        notes_in_evernote = set()
        for note in get_notes(get_notebooks()):
            guid = note.guid
            notes_in_evernote.add(guid)

            # `in note_info` is a single keyed lookup; `in note_info.keys()`
            # would walk every key in the table for each note.
            up_to_date = False
            if guid in note_info:
                stored = note_info[guid]
                up_to_date = (stored[FONT_SIZE] == conf.font_size
                              and stored[LINE_HEIGHT] == conf.line_height)

            if not up_to_date:
                adjust_note(note)
                note_info[guid] = {FONT_SIZE: conf.font_size,
                                   LINE_HEIGHT: conf.line_height}

        # Materialise the list first so nothing is deleted while iterating.
        guids_to_forget = [guid for guid in note_info.keys()
                           if guid not in notes_in_evernote]

        for guid in guids_to_forget:
            logging.debug("Delete guid from DB: {}".format(guid))
            del note_info[guid]
    finally:
        note_info.close()

コード例 #3

0

ファイルを表示

class ImageCache:
    """Persistent record of processed images, stored under ``<directory>/.mikula``.

    Each image entry maps the source filename to
    ``(timestamp, scaled, scaled_time, thumbnail, thumbnail_time)``; the
    special key ``"config"`` holds the last stored configuration.
    """

    def __init__(self, directory):
        """Open (creating if needed) ``<directory>/.mikula/images.cache``."""
        db_dirname = os.path.join(directory, ".mikula")
        if not os.path.isdir(db_dirname):
            os.mkdir(db_dirname)
        db_filename = os.path.join(db_dirname, "images.cache")
        self.cache = SqliteDict(db_filename)
        # Result of the most recent successful require_update() lookup.
        self.recent_lookup_ = None

    def reset(self):
        """Drop every cached entry and forget the last lookup."""
        self.cache.clear()
        self.recent_lookup_ = None

    def config_changed(self, config):
        """Return True when no config is stored or it differs from *config*."""
        # Direct membership test instead of scanning every key via .keys().
        if "config" not in self.cache:
            return True
        return self.cache["config"] != config

    def update_config(self, config):
        """Store *config* and commit."""
        self.cache["config"] = config
        self.cache.commit()

    def require_update(self, filename):
        """Return True when *filename* must be (re)processed.

        Returns False only when the entry exists, both derived files exist,
        and all three modification times equal the recorded ones; in that
        case the derived paths become available through get_filenames().
        """
        if filename not in self.cache:
            self.recent_lookup_ = None
            return True

        timestamp, scaled, scaled_time, thumbnail, thumbnail_time = self.cache[
            filename]
        if os.path.exists(scaled) and os.path.exists(thumbnail):
            if os.path.getmtime(filename) == timestamp and \
               os.path.getmtime(scaled) == scaled_time and \
               os.path.getmtime(thumbnail) == thumbnail_time:
                self.recent_lookup_ = (scaled, thumbnail)
                return False
        self.recent_lookup_ = None
        return True

    def get_filenames(self):
        """Return ``(scaled, thumbnail)`` from the last cache hit, or None."""
        return self.recent_lookup_

    def update(self, filename, scaled, thumbnail):
        """Record current modification times for *filename* and its outputs."""
        timestamp = os.path.getmtime(filename)
        scaled_time = os.path.getmtime(scaled)
        thumbnail_time = os.path.getmtime(thumbnail)
        self.cache[filename] = (timestamp, scaled, scaled_time, thumbnail,
                                thumbnail_time)
        self.cache.commit()

コード例 #4

0

ファイルを表示

ファイル: negative_sampling_test.py プロジェクト: uhh-lt/kb2vec

def get_statistics_true_url(positives_negatives, urls_db):
    """Report how often the true URL appears among the negative samples.

    Each item of *positives_negatives* must unpack into
    ``(entity, beg, end, true_url, context, negative_samples)``. For every
    item whose true URL is not among its (stripped) negative samples but is a
    key of *urls_db*, a ``entity<TAB>true_url`` line is appended to
    ``candidates_without_true_name1_.tsv``.

    Prints the three counters and returns ``count_exist / count_all`` as a
    float, or ``0.0`` when *positives_negatives* is empty.
    """
    db = SqliteDict(urls_db, autocommit=False)
    try:
        # A set gives O(1) membership; assumes keys are hashable (SqliteDict's
        # default keys are strings).
        urls = set(db.keys())
    finally:
        db.close()

    count_exist = 0
    count_all = 0
    count_not_included = 0

    # The context manager guarantees the appended lines are flushed and closed.
    with codecs.open('candidates_without_true_name1_.tsv', 'a') as file:
        for positive_negative in positives_negatives:
            entity, beg, end, true_url, context, negative_samples = positive_negative

            samples = [negative_sample.strip()
                       for negative_sample in negative_samples]

            if true_url in samples:
                count_exist += 1
            elif true_url in urls:
                file.write(str(entity) + '\t' + str(true_url) + '\n')
            else:
                count_not_included += 1

            count_all += 1

    print(count_exist)
    print(count_all)
    print(count_not_included)
    if count_all == 0:
        # Nothing to average over; avoid ZeroDivisionError on empty input.
        return 0.0
    return float(count_exist) / count_all

コード例 #5

0

ファイルを表示

ファイル: sqllite.py プロジェクト: wonabru/chainnet

class CDataBase(object):
    """Thin wrapper around the SqliteDict stored at ``./DB/my_db.sqlite``."""

    def __init__(self):
        """Open the store (autocommit) and print its current contents."""
        try:
            # Best-effort close of a previous connection when __init__ is
            # re-run on an existing instance; on a fresh instance there is
            # no `mydict` yet and this raises AttributeError.
            self.close()
        except Exception:
            pass
        self.mydict = SqliteDict('./DB/my_db.sqlite', autocommit=True)
        self.show()

    def set(self, key, value):
        """Store *value* under *key*."""
        self.mydict[key] = value

    def get(self, key):
        """Return the value stored under *key*, or None when absent."""
        # One keyed lookup instead of scanning every key via .keys().
        try:
            return self.mydict[key]
        except KeyError:
            return None

    def show(self, start_with=''):
        """Print every entry whose key starts with *start_with* (all by default)."""
        for key, value in self.mydict.items():
            # str.find() returns an index (0 for a prefix match, -1 for no
            # match), so using it as a boolean inverted the filter.
            if key.startswith(start_with):
                print(key, '\t', value, '\n')

    def clear(self):
        """Remove every entry."""
        self.mydict.clear()

    def close(self):
        """Close the underlying store."""
        self.mydict.close()

コード例 #6

0

ファイルを表示

def create_completely_random(urls_db, contexts, phrases, n=10):
    """Build one sample per phrase with *n* randomly chosen URLs as negatives.

    Args:
      urls_db - path of the SqliteDict whose keys are the candidate URLs.
      contexts - mapping from phrase key to its context.
      phrases - mapping from phrase key to ``(entity, beg, end, true_url)``.
      n - number of random URLs appended to each sample.

    Returns:
      list of samples; each is
      ``[(entity, beg, end, true_url, context), url_1, ..., url_n]``.
    """
    db = SqliteDict(urls_db, autocommit=False)
    try:
        urls = list(db.keys())
    finally:
        # Only the keys are needed, so release the connection right away.
        db.close()

    samples = list()
    keys = phrases.keys()

    print(len(keys))

    for count, key in enumerate(keys):
        if count % 100 == 0:
            print(count)
        entity, beg, end, true_url = phrases[key]
        negatives = [(entity, beg, end, true_url, contexts[key])]

        # Re-shuffle the shared list in place, then take the first n entries.
        shuffle(urls)

        negatives.extend(urls[:n])
        samples.append(negatives)

    return samples

コード例 #7

0

ファイルを表示

def create_keywords_from_url(url_db):
    """Map every URL key of *url_db* to the set of lemmatized keywords in its last path segment.

    The last ``/``-separated segment is split on ``_``. If the segment
    contains an en dash, each piece is further split on the en dash;
    otherwise, if it contains a hyphen, each piece is split on the hyphen.
    Every word is stripped of ``,``, ``.``, ``(`` and ``)`` (in that order),
    lower-cased and lemmatized.
    """
    store = SqliteDict(url_db, autocommit=False)
    all_urls = store.keys()
    lemmatizer = WordNetLemmatizer()

    keywords_by_url = dict()
    for url in all_urls:
        entity = url.split('/')[-1]

        # The en dash takes precedence over the plain hyphen.
        if "–" in entity:
            separator = '–'
        elif "-" in entity:
            separator = '-'
        else:
            separator = None

        if separator is None:
            words = entity.split('_')
        else:
            words = [piece
                     for chunk in entity.split('_')
                     for piece in chunk.split(separator)]

        keywords_by_url[url] = {
            lemmatizer.lemmatize(
                word.strip(',').strip('.').strip('(').strip(')').lower())
            for word in words
        }

    store.close()

    return keywords_by_url

コード例 #8

0

ファイルを表示

ファイル: api.py プロジェクト: agking10/IR_FinalProject

def clear_db(db_path_shadow: str) -> None:
    """Delete every entry of the SqliteDict at *db_path_shadow*, showing progress."""
    store = SqliteDict(db_path_shadow)
    print("Clearing db {}".format(db_path_shadow))
    progress = tqdm(store.keys())
    for stale_key in progress:
        del store[stale_key]
    # Persist the deletions before releasing the connection.
    store.commit()
    store.close()

コード例 #9

0

ファイルを表示

class DiskQueue():
    """Disk-backed work queue built from three SqliteDict tables.

    A key lives in exactly one of ``todo`` (waiting), ``inProgress`` (handed
    out by Next) or ``seen`` (finished via Done).
    """
    DIR_PATH = './diskqueue'
    IN_PROGRESS_DB_NAME = 'inprogress.sqlite'
    TODO_DB_NAME = 'todo.sqlite'
    SEEN_DB_NAME = 'seen.sqlite'

    def __init__(self, load: bool = False):
        """Open the three stores.

        With ``load=False`` any existing database files are removed first.
        With ``load=True`` existing state is kept and everything that was in
        progress is moved back to the todo queue.
        """
        self.iterLock = threading.Lock()

        os.makedirs(self.DIR_PATH, exist_ok=True)

        if not load:
            for name in [self.IN_PROGRESS_DB_NAME, self.TODO_DB_NAME, self.SEEN_DB_NAME]:
                try:
                    os.remove(self._db_path(name))
                except OSError:
                    # Best-effort cleanup: the file usually just doesn't exist yet.
                    continue

        self.inProgress = SqliteDict(self._db_path(self.IN_PROGRESS_DB_NAME), autocommit=True)
        self.todo = SqliteDict(self._db_path(self.TODO_DB_NAME), autocommit=True)
        self.seen = SqliteDict(self._db_path(self.SEEN_DB_NAME), autocommit=True)

        # If we need to load state, add everything that was in progress to the todo queue
        if load:
            # Snapshot the keys first so the table is not modified mid-iteration.
            for key in list(self.inProgress.iterkeys()):
                self.todo[key] = True
                del self.inProgress[key]

    def _db_path(self, name):
        """Return the path of database file *name* inside DIR_PATH."""
        return '{}/{}'.format(self.DIR_PATH, name)

    def Push(self, key):
        """Queue *key* unless it is already waiting, in progress or finished."""
        if (key not in self.todo) and (key not in self.inProgress) and (key not in self.seen):
            self.todo[key] = True

    def Next(self):
        """Move one key from todo to inProgress and return it (None when empty)."""
        with self.iterLock:
            # iter() makes this work whether keys() yields a generator, a
            # list or a view.
            toReturn = next(iter(self.todo.keys()), None)
            # Compare against None so falsy keys ('' or 0) are still handed out.
            if toReturn is not None:
                self.inProgress[toReturn] = True
                del self.todo[toReturn]
        return toReturn

    def Done(self, key):
        """Mark *key* as finished."""
        self.seen[key] = True
        del self.inProgress[key]

    def Close(self):
        """Close all three stores."""
        self.inProgress.close()
        self.todo.close()
        self.seen.close()

    def IsDone(self):
        """Return True when nothing is waiting and nothing is in progress."""
        with self.iterLock:
            return len(self.todo) == 0 and len(self.inProgress) == 0

コード例 #10

0

ファイルを表示

    def tag(self, text, label):
        """Append ``text<TAB>label`` to the file at self.f.

        Raises Exception when *label* is not a key of the 'labels' table of
        the database at self.db. Calls self.done() afterwards and returns True.
        """
        db = SqliteDict(self.db, tablename='labels')
        try:
            # Keyed lookup instead of scanning every key via .keys().
            label_known = label in db
        finally:
            # The store is only needed for the check; don't leak the connection.
            db.close()

        if not label_known:
            raise Exception('Label not found check your labels')

        with open(self.f, 'a+') as file:
            line = text + '\t' + label + '\n'
            file.write(line)
        self.done()
        return True

コード例 #11

0

ファイルを表示

ファイル: test_core.py プロジェクト: RaRe-Technologies/sqlitedict

    def test_default_reuse_existing_flag_c(self):
        """Opening an existing database again with default arguments keeps its data."""
        # given: a database file holding one committed entry,
        db_file = norm_file('tests/db/sqlitedict-override-test.sqlite')
        first_handle = SqliteDict(filename=db_file)
        first_handle['key'] = 'value'
        first_handle.commit()
        first_handle.close()

        # verify: a second handle on the same file still sees that entry.
        second_handle = SqliteDict(filename=db_file)
        self.assertIn('key', second_handle.keys())
        self.assertEqual(second_handle['key'], 'value')

コード例 #12

0

ファイルを表示

ファイル: test_core.py プロジェクト: RaRe-Technologies/sqlitedict

    def test_overwrite_using_flag_n(self):
        """Re-opening of a database with flag='n' destroys it all."""
        # given: table 'sometable' holding one committed entry,
        db_file = norm_file('tests/db/sqlitedict-override-test.sqlite')
        seeded = SqliteDict(filename=db_file, tablename='sometable')
        seeded['key'] = 'value'
        seeded.commit()
        seeded.close()

        # verify: after re-opening with flag='n' the entry is gone.
        reopened = SqliteDict(filename=db_file, tablename='sometable', flag='n')
        self.assertNotIn('key', reopened.keys())

コード例 #13

0

ファイルを表示

    def test_default_reuse_existing_flag_c(self):
        """A database re-opened with default arguments still holds its entries."""
        # given: one committed entry written through a first connection,
        path = norm_file('tests/db/sqlitedict-override-test.sqlite')
        writer = SqliteDict(filename=path)
        writer['key'] = 'value'
        writer.commit()
        writer.close()

        # verify: a fresh connection can list and read that entry.
        reader = SqliteDict(filename=path)
        self.assertIn('key', reader.keys())
        self.assertEqual(reader['key'], 'value')

コード例 #14

0

ファイルを表示

    def test_overwrite_using_flag_n(self):
        """Opening an existing database with flag='n' discards its previous contents."""
        # given: one committed entry in table 'sometable',
        path = norm_file('tests/db/sqlitedict-override-test.sqlite')
        writer = SqliteDict(filename=path, tablename='sometable')
        writer['key'] = 'value'
        writer.commit()
        writer.close()

        # verify: the entry is no longer listed once re-opened with flag='n'.
        fresh = SqliteDict(filename=path, tablename='sometable', flag='n')
        self.assertNotIn('key', fresh.keys())

コード例 #15

0

ファイルを表示

ファイル: test_core.py プロジェクト: RaRe-Technologies/sqlitedict

    def test_overwrite_using_flag_w(self):
        """Re-opening of a database with flag='w' destroys only the target table."""
        # given: the same entry committed to two tables of one file,
        db_file = norm_file('tests/db/sqlitedict-override-test.sqlite')
        table_one = SqliteDict(filename=db_file, tablename='one')
        table_one['key'] = 'value'
        table_one.commit()
        table_one.close()

        table_two = SqliteDict(filename=db_file, tablename='two')
        table_two['key'] = 'value'
        table_two.commit()
        table_two.close()

        # verify: re-opening table 'one' with flag='w' empties it, while
        # table 'two', re-opened without an explicit flag, keeps its entry.
        reopened_one = SqliteDict(filename=db_file, tablename='one', flag='w')
        self.assertNotIn('key', reopened_one.keys())

        reopened_two = SqliteDict(filename=db_file, tablename='two')
        self.assertIn('key', reopened_two.keys())

コード例 #16

0

ファイルを表示

    def test_overwrite_using_flag_w(self):
        """flag='w' wipes the table being opened and leaves other tables alone."""
        # given: tables 'one' and 'two' each holding one committed entry,
        path = norm_file('tests/db/sqlitedict-override-test.sqlite')
        seed_one = SqliteDict(filename=path, tablename='one')
        seed_one['key'] = 'value'
        seed_one.commit()
        seed_one.close()

        seed_two = SqliteDict(filename=path, tablename='two')
        seed_two['key'] = 'value'
        seed_two.commit()
        seed_two.close()

        # verify: table 'one' opened with flag='w' no longer lists the key,
        # whereas table 'two' opened without an explicit flag still does.
        wiped = SqliteDict(filename=path, tablename='one', flag='w')
        self.assertNotIn('key', wiped.keys())

        untouched = SqliteDict(filename=path, tablename='two')
        self.assertIn('key', untouched.keys())

コード例 #17

0

ファイルを表示

def load_embeddings(embeddings_path):
    """Open the pre-trained word-embedding store and report its vector size.

    NOTE(review): ``embeddings_path`` is accepted but never read; the store is
    always opened from ``RESOURCE_PATH['WORD_EMBEDDINGS']`` - confirm this is
    intended.

    Args:
      embeddings_path - currently unused (see note above).

    Returns:
      embeddings - the opened SqliteDict (presumably word -> vector; verify);
      embeddings_dim - ``shape[0]`` of the value stored under the first key.
    """
    store = SqliteDict(RESOURCE_PATH['WORD_EMBEDDINGS'])
    # Any entry will do for probing the dimension; take the first key yielded.
    first_word = next(store.keys())
    first_vector = store[first_word]

    return store, first_vector.shape[0]

コード例 #18

0

ファイルを表示

def search_for_query(query, sim_thresh=0.8) -> str:
    """Return the stored query most similar to *query*, or "" if none is close enough.

    Every key of the store at ``query_map_path`` is scored with
    ``sim_query(key, query)``. The first key with the strictly highest
    positive score is returned when that score exceeds *sim_thresh*.
    """
    store = SqliteDict(query_map_path)
    best_key, best_score = "", 0
    for candidate in store.keys():
        score = sim_query(candidate, query)
        if score > best_score:
            best_key, best_score = candidate, score
    store.close()
    return best_key if best_score > sim_thresh else ""

コード例 #19

0

ファイルを表示

ファイル: dcl.py プロジェクト: dailing/mask_rcnn

def predict(config):
    """Run the network over the train and test splits and pickle selected outputs.

    If the snapshot store ``<config.output_dir>/dcl_snap.db`` is non-empty,
    the weights stored under ``config.predict.load_epoach`` are loaded; a
    value of -1 selects the last key of the store (and is written back to
    ``config.predict.load_epoach``).

    For every key in ``config.predict.values_to_save`` (default
    ``['softmax']``), the per-batch tensors are concatenated and the resulting
    dict is written to ``<config.output_dir>/predict.pkl``.
    """
    device = torch.device('cuda')
    loader = DataLoader(TrainEvalDataset(
        config.dataset(split='train', **config.dataset_parameter), config),
                        1000,
                        False,
                        num_workers=num_processor)
    test_loader = DataLoader(TrainEvalDataset(
        config.dataset(split='test', **config.dataset_parameter), config),
                             1000,
                             False,
                             num_workers=num_processor)
    net = NetModel(config.net)
    net = net.to(device)

    storage_dict = SqliteDict(f'{config.output_dir}/dcl_snap.db')
    try:
        if len(storage_dict) > 0:
            kk = list(storage_dict.keys())
            if config.predict.load_epoach == -1:
                config.predict.load_epoach = kk[-1]
            net.load_state_dict(torch.load(BytesIO(
                storage_dict[config.predict.load_epoach]),
                                           map_location=device),
                                strict=True)
            logger.info(f'loading from epoach {config.predict.load_epoach}')
    finally:
        # The snapshot store is only needed while loading the weights.
        storage_dict.close()

    net.eval()

    outputs = {}
    keys = ['softmax']
    if config.predict.values_to_save is not None and len(
            config.predict.values_to_save) > 0:
        keys = config.predict.values_to_save
    for k in keys:
        outputs[k] = []
    for batch_cnt, batch in tqdm(enumerate(chain(loader, test_loader)),
                                 total=len(loader) + len(test_loader)):
        image, label = batch
        image = image.to(device)
        net_out = net(image)
        # Merge the label dict into the outputs so label fields can be saved too.
        net_out.update(label)
        for k in keys:
            outputs[k].append(net_out[k].detach().cpu())
    for k in keys:
        outputs[k] = torch.cat(outputs[k])
    # The context manager closes the file so the pickle is flushed to disk.
    with open(f'{config.output_dir}/predict.pkl', 'wb') as pkl_file:
        pickle.dump((outputs), pkl_file)

コード例 #20

0

ファイルを表示

    def test_special_keys(self):
        """Non-string keys (int, tuple, frozenset) round-trip alongside a str key."""
        entries = [
            ('1', 1),
            (1, 'ONE'),
            (('a', 1), 'testtuple'),
            (frozenset([1, 2, '2']), 'testfrozenset'),
        ]

        db = SqliteDict()
        for key, value in entries:
            db[key] = value
        for key, value in entries:
            assert db[key] == value

        # This tests the reverse conversion
        stored_keys = list(db.keys())
        assert len(stored_keys) == 4
        for key, _ in entries:
            assert key in stored_keys

コード例 #21

0

ファイルを表示

class Database:
    """Persistent list of echo messages stored under the ``"echoes"`` key of ``database.sqlite``."""

    def __init__(self):
        """Open the store and create an empty ``"echoes"`` list on first use."""
        self.db = SqliteDict("database.sqlite")

        # Keyed lookup instead of scanning every key via .keys().
        if "echoes" not in self.db:
            self.db["echoes"] = []
            self.db.commit()

    def add_to_echoes(self, echo_text: str):
        """Append *echo_text* unless it is already stored."""
        echoes = self.db["echoes"]
        if echo_text not in echoes:
            echoes.append(echo_text)
            # Values read from the store are copies; assign back to persist.
            self.db["echoes"] = echoes
            self.db.commit()

    def get_random_echo(self) -> str:
        """Return a random stored message, or a fixed notice when none exist."""
        try:
            return choice(self.db["echoes"])
        except IndexError:
            return "Congrats, that's the first message in my database!"

    def get_stats(self) -> str:
        """Return a one-line message count."""
        return f"Message count: {len(self.db['echoes'])}"

    def get_dump(self) -> BytesIO:
        """Return all messages, newline-joined and UTF-8 encoded, as ``dump.txt``."""
        dump_bytes = BytesIO(
            bytes("\n".join(self.db['echoes']), encoding="utf-8"))
        dump_bytes.name = "dump.txt"
        return dump_bytes

    def overwrite(self, ow_bytes: BytesIO):
        """Replace the stored messages with the newline-separated UTF-8 content of *ow_bytes*."""
        ow_bytes.seek(0)
        ow_list = ow_bytes.read().decode("utf-8").split("\n")
        self.db["echoes"] = ow_list
        self.db.commit()

    def remove_echo(self, echo_text: str) -> str:
        """Remove *echo_text* from the stored messages and report the outcome."""
        echoes = self.db["echoes"]
        try:
            echoes.remove(echo_text)
        except ValueError:
            return "That text isn't in my database!"
        # Mutating the list returned by the store changes only a deserialized
        # copy; it must be assigned back for the removal to be saved.
        self.db["echoes"] = echoes
        self.db.commit()
        return "Successfully removed from my database!"

コード例 #22

0

ファイルを表示

ファイル: cache.py プロジェクト: venkataravi7/gramex

class SQLiteStore(KeyStore):
    '''
    A KeyStore backed by a table in a SQLite file. Typical usage::

        >>> store = SQLiteStore('file.db', table='store')
        >>> value = store.load(key)
        >>> store.dump(key, value)

    Values are serialised to compact, ASCII-only JSON with
    gramex.config.CustomJSONEncoder and read back with CustomJSONDecoder,
    producing AttrDict objects for JSON objects.
    '''
    def __init__(self, path, table='store', *args, **kwargs):
        super(SQLiteStore, self).__init__(*args, **kwargs)
        self.path = _create_path(path)
        from sqlitedict import SqliteDict

        def _to_json(value):
            # Compact separators keep the stored text small.
            return json.dumps(value,
                              separators=(',', ':'),
                              ensure_ascii=True,
                              cls=CustomJSONEncoder)

        def _from_json(text):
            return json.loads(
                text, object_pairs_hook=AttrDict, cls=CustomJSONDecoder)

        self.store = SqliteDict(
            self.path,
            tablename=table,
            autocommit=True,
            encode=_to_json,
            decode=_from_json,
        )

    def close(self):
        '''Close the underlying SQLite store.'''
        self.store.close()

    def flush(self):
        '''Run the parent flush, then commit the SQLite store.'''
        super(SQLiteStore, self).flush()
        self.store.commit()

    def keys(self):
        '''Return a generator over the escaped form of every stored key.'''
        # Keys need to be escaped
        return (self._escape(key) for key in self.store.keys())

    def purge(self):
        '''Log the purge at debug level, then delegate to the parent purge.'''
        app_log.debug('Purging %s', self.path)
        super(SQLiteStore, self).purge()

コード例 #23

0

ファイルを表示

ファイル: learner.py プロジェクト: yattom/mancala

    def stat(dbname):
        """Print the record count of *dbname* and a histogram of move counts.

        Every value is indexed by each of its keys to obtain a
        ``(diff_total, count)`` pair; the histogram records how many pairs
        carry each ``count``. It is printed in ascending order of frequency,
        one ``"  <frequency> <count>"`` line per distinct count.
        """
        database = SqliteDict(dbname)
        try:
            print(f"{len(database)} records")
            counts = {}
            for key in database.keys():
                # NOTE(review): eval() executes whatever text is stored as the
                # key; this is only safe for databases this program wrote
                # itself. The unpacking also checks the key is a 2-item value.
                _player, _stones = eval(key)
                val = database[key]
                for hole_idx in val:
                    _diff_total, count = val[hole_idx]
                    counts[count] = counts.get(count, 0) + 1
        finally:
            # Release the connection even if a record is malformed.
            database.close()

        print("distribution of moves ever played")
        for a, b in sorted(counts.items(), key=lambda v: v[1]):
            print(f"  {b} {a}")

コード例 #24

0

ファイルを表示

ファイル: data_helper.py プロジェクト: uhh-lt/kb2vec

def create_db_from_dictdb(lookup_db_path, longabs_db_path, labels_db_path,
                          db_name):
    """Merge three SqliteDict stores into a ``graph`` table of a new SQLite file.

    For every key of the lookup store, one row
    ``(node_id, long_abstracts, labels)`` is inserted, taking the values from
    the lookup, long-abstract and label stores respectively. A KeyError from
    any store propagates. Inserts are committed every 100000 rows and once
    more at the end.
    """
    connection = sqlite3.connect(db_name)
    opened = []
    try:
        cursor = connection.cursor()

        cursor.execute(
            '''CREATE TABLE graph (node_id INTEGER PRIMARY KEY NOT NULL, long_abstracts TEXT, labels TEXT)'''
        )

        connection.commit()

        # Track each store as it opens so all of them are closed on any failure.
        for path in (lookup_db_path, longabs_db_path, labels_db_path):
            opened.append(SqliteDict(path, autocommit=False))
        lookup_db, longabs_db, labels_db = opened

        for count, node in enumerate(lookup_db.keys()):
            longab = longabs_db[node]
            label = labels_db[node]
            node_id = lookup_db[node]

            cursor.execute('''INSERT INTO graph VALUES (?,?,?)''',
                           (node_id, longab, label))

            if count % 100000 == 0:
                print(count)
                connection.commit()

        connection.commit()
    finally:
        connection.close()
        for store in opened:
            store.close()

コード例 #25

0

ファイルを表示

def filter_negative_samples_randomly(positives_negatives, url_db, n=10):
    """Trim or pad each sample's negatives to at most *n* randomly ordered URLs.

    Each item of *positives_negatives* is indexed as
    ``(entity, beg, end, true_url, context, negative_samples)``; ``beg`` and
    ``end`` are converted with ``int``. Items whose ``true_url`` is not a key
    of *url_db* are skipped.

    - No negatives: *n* random keys of *url_db* are used.
    - Fewer than *n*: the shuffled negatives are padded with random keys.
    - Otherwise: the negatives are shuffled and the first *n* kept.

    The caller's ``negative_samples`` lists are left untouched.

    Returns:
      list of ``[(entity, beg, end, true_url, context), url_1, ...]``.
    """
    filtered_samples = list()
    db = SqliteDict(url_db, autocommit=False)
    try:
        urls = list(db.keys())

        for positive_negative in positives_negatives:
            entity = positive_negative[0]
            beg = int(positive_negative[1])
            end = int(positive_negative[2])
            true_url = positive_negative[3]
            context = positive_negative[4]
            # Work on a copy: shuffling/extending must not alter the input.
            negative_samples = list(positive_negative[5])

            # check if db contains this url or not; if not, skip it.
            if true_url not in db:
                continue

            if not negative_samples:
                # No negatives at all: completely random samples are used.
                shuffle(urls)
                negative_samples = urls[:n]
            else:
                addition = list()
                if len(negative_samples) < n:
                    shuffle(urls)
                    addition = urls[:n - len(negative_samples)]
                shuffle(negative_samples)
                negative_samples.extend(addition)

            filtered_sample = [(entity, beg, end, true_url, context)]
            filtered_sample.extend(negative_samples[:n])
            filtered_samples.append(filtered_sample)
    finally:
        db.close()

    return filtered_samples

コード例 #26

0

ファイルを表示

ファイル: construct_graph.py プロジェクト: uhh-lt/kb2vec

    def create_nodes_from_db(self,
                             longabsdb_path,
                             labelsdb_path,
                             lookupdb_path,
                             subnodes=False):
        """Add one graph node per URL, with attributes read from three stores.

        The URLs are *subnodes* when it is truthy, otherwise every key of the
        lookup store. Each node is keyed by the integer id from the lookup
        store and carries ``id``, ``url``, ``long_abstract`` and ``title``.
        Progress is logged every 100000 URLs.
        """
        abstracts = SqliteDict(longabsdb_path, autocommit=False)
        titles = SqliteDict(labelsdb_path, autocommit=False)
        node_ids = SqliteDict(lookupdb_path, autocommit=False)

        urls = subnodes if subnodes else node_ids.keys()

        for processed, url in enumerate(urls):
            # Both the long abstract and the title are stored as strings.
            long_abstract = abstracts[url]
            title = titles[url]
            # The lookup value is converted to the integer node id.
            node_id = int(node_ids[url])

            # id, url, long abstract (text), and title are attributes.
            self._G.add_node(node_id,
                             id=node_id,
                             url=url,
                             long_abstract=long_abstract,
                             title=title)
            if processed % 100000 == 0:
                self._logger.info(str(processed) + ' nodes are processed..')

        abstracts.close()
        titles.close()
        node_ids.close()

コード例 #27

0

ファイルを表示

ファイル: data_utils.py プロジェクト: mssammon/lorelei-eng-edl

class LORELEIKBLoader:
    """
        Class for loading the LORELEI knowledge base (KB).

        After construction, ``kb`` maps entity id -> record dict and
        ``name2ent`` maps name -> list of record dicts; both are SqliteDict
        stores opened read-only.
    """
    def __init__(self, kbfile):
        self.kb = {}  # eid --> item
        self.name2ent = {}
        self.load_kb(kbfile)

    def _open_readonly(self, e2e_path, n2e_path):
        """Open both stores in read-only mode."""
        self.kb = SqliteDict(e2e_path, tablename='lorelei', flag='r')
        self.name2ent = SqliteDict(n2e_path,
                                   tablename='name2ent',
                                   flag='r')

    def _persist_and_reopen(self, e2e_path, n2e_path):
        """Commit and close the writable stores, then reopen them read-only."""
        logging.info("Writing KB dictionary to disk.")
        self.kb.commit()
        self.kb.close()
        self.name2ent.commit()
        self.name2ent.close()
        self._open_readonly(e2e_path, n2e_path)

    def load_kb(self, kbfile):
        """
            Primary function of the class -- loads the LORELEI kb which is to
            be found at the provided path.

            If ``kbfile + "e2e.pkl"`` exists, the stores are opened from it
            and from ``kbfile + "n2e.pkl"``. Otherwise *kbfile* is read as a
            tab-separated file, both stores are built, written to disk and
            reopened read-only. A KeyboardInterrupt stops reading early but
            still persists what was read so far.
        """
        e2e_path = kbfile + "e2e.pkl"
        n2e_path = kbfile + "n2e.pkl"
        if os.path.exists(e2e_path):
            logging.info("pkl found! loading map %s", e2e_path)
            self._open_readonly(e2e_path, n2e_path)
            return

        logging.info("pkl not found ...")
        self.kb = SqliteDict(e2e_path,
                             tablename='lorelei',
                             autocommit=False)
        self.name2ent = SqliteDict(n2e_path,
                                   tablename='name2ent',
                                   autocommit=False)
        try:
            with open(kbfile) as kb_lines:
                for idx, line in enumerate(kb_lines):
                    if idx > 0 and idx % 1000000 == 0:
                        logging.info("read %d lines", idx)

                    parts = line.rstrip('\n').split('\t')

                    # `fields` is not defined in this class; presumably a
                    # module-level sequence of column names - verify.
                    if len(parts) != len(fields):
                        logging.info("bad line %d", idx)
                        continue

                    # Keep only the non-empty columns of this record.
                    endict = {field: v
                              for field, v in zip(fields, parts)
                              if len(v) != 0}

                    # Store the record once it is complete, not once per
                    # column.
                    self.kb[endict['entityid']] = endict
                    name = endict['name']

                    # Stored values are copies, so read, extend, write back.
                    lst = self.name2ent[name] if name in self.name2ent else []
                    lst.append(endict)
                    self.name2ent[name] = lst
        except KeyboardInterrupt:
            logging.info("ending prematurely.")

        # Persist exactly once, after reading has finished or was interrupted;
        # both paths reopen the same 'lorelei' / 'name2ent' tables.
        self._persist_and_reopen(e2e_path, n2e_path)

    def __getitem__(self, item):
        """Return the record stored under entity id *item*."""
        return self.kb[item]

    def keys(self):
        """Return the entity ids of the loaded KB."""
        return self.kb.keys()

コード例 #28

0

ファイルを表示

ファイル: OptView_baseclass.py プロジェクト: yqliaohk/pyoptsparse

    def OptimizationHistory(self):
        """
        Reads in database history file and stores contents.
        Function information is stored as a dict in func_data,
        variable information is stored as a dict in var_data,
        and bounds information is stored as a dict in bounds.
        """

        # Initialize dictionaries for design variables and unknowns.
        # The data is saved redundantly in dicts for all iterations and then
        # for major iterations as well.
        self.func_data_all = {}
        self.func_data_major = {}
        self.var_data_all = {}
        self.var_data_major = {}
        db = {}
        self.num_iter = 0

        # Loop over each history file name provided by the user.
        for histIndex, histFileName in enumerate(self.histList):

            # If they only have one history file, we don't change the keys' names
            if len(self.histList) == 1:
                histIndex = ''
            else:  # If multiple history files, append letters to the keys,
                # such that 'key' becomes 'key_A', 'key_B', etc
                histIndex = '_' + chr(histIndex + ord('A'))
            self.histIndex = histIndex

            try:  # This is the classic method of storing history files
                db = shelve.open(histFileName, 'r')
                OpenMDAO = False
            except:  # Bare except because error is not in standard Python.
                # If the db has the 'iterations' tag, it's an OpenMDAO db.
                db = SqliteDict(histFileName, 'iterations')
                OpenMDAO = True

                # Need to do this since in py3 db.keys() is a generator object
                keys = [i for i in db.keys()]

                # If it has no 'iterations' tag, it's a pyOptSparse db.
                if keys == []:
                    OpenMDAO = False
                    db = SqliteDict(histFileName)

            # Specific instructions for OpenMDAO databases
            if OpenMDAO:

                # Get the number of iterations by looking at the largest number
                # in the split string names for each entry in the db
                if major_python_version == 3:
                    for string in db.keys():
                        string = string.split('|')
                else:
                    string = db.keys()[-1].split('|')

                nkey = int(string[-1])
                self.solver_name = string[0]

                # Initalize a list detailing if the iterations are major or minor
                self.iter_type = np.zeros(nkey)

                # Get the keys of the database where derivatives were evaluated.
                # These correspond to major iterations, while no derivative
                # info is calculated for gradient-free linesearches.
                deriv_keys = SqliteDict(histFileName, 'derivs').keys()
                self.deriv_keys = [
                    int(key.split('|')[-1]) for key in deriv_keys
                ]

                # Save information from the history file for the funcs.
                self.DetermineMajorIterations(db, OpenMDAO=OpenMDAO)

                # Save information from the history file for the unknowns.
                self.SaveDBData(db,
                                self.func_data_all,
                                self.func_data_major,
                                OpenMDAO=OpenMDAO,
                                data_str='Unknowns')

                # Save information from the history file for the design variables.
                self.SaveDBData(db,
                                self.var_data_all,
                                self.var_data_major,
                                OpenMDAO=OpenMDAO,
                                data_str='Parameters')

                # Add labels to OpenMDAO variables.
                # Corresponds to constraints, design variables, and objective.
                try:
                    db = SqliteDict(histFileName, 'metadata')
                    self.SaveOpenMDAOData(db)

                except KeyError:  # Skip metadata info if not included in OpenMDAO hist file
                    pass

            else:

                # Get the number of iterations
                nkey = int(db['last']) + 1
                self.nkey = nkey

                # Initalize a list detailing if the iterations are major or minor
                self.iter_type = np.zeros(nkey)

                # Check to see if there is bounds information in the db file.
                # If so, add them to self.bounds to plot later.
                try:
                    try:
                        info_dict = db['varInfo'].copy()
                        info_dict.update(db['conInfo'])
                        scale_info = True
                    except KeyError:
                        self.warning_display(
                            'This is an older optimization history file.\n' +
                            'Only bounds information has been stored, not scalar info.'
                        )
                        info_dict = db['varBounds'].copy()
                        info_dict.update(db['conBounds'])
                        scale_info = False

                    # Got to be a little tricky here since we're modifying
                    # info_dict; if we simply loop over it with the generator
                    # from Python3, it will contain the new keys and then the
                    # names will be mangled incorrectly.
                    bounds_dict = {}
                    scaling_dict = {}
                    for key in info_dict.keys():
                        bounds_dict[key + histIndex] = {
                            'lower': info_dict[key]['lower'],
                            'upper': info_dict[key]['upper']
                        }
                        if scale_info:
                            scaling_dict[key +
                                         histIndex] = info_dict[key]['scale']

                    self.bounds.update(bounds_dict)
                    if scale_info:
                        self.scaling.update(scaling_dict)
                except KeyError:
                    pass

                # Check to see if there is proper saved info about iter type
                if 'isMajor' in db['0'].keys():
                    self.storedIters = True
                else:
                    self.storedIters = False

                # Save information from the history file for the funcs.
                self.DetermineMajorIterations(db, OpenMDAO=OpenMDAO)

                # Save information from the history file for the funcs.
                self.SaveDBData(db,
                                self.func_data_all,
                                self.func_data_major,
                                OpenMDAO=OpenMDAO,
                                data_str='funcs')

                # Save information from the history file for the design variables.
                self.SaveDBData(db,
                                self.var_data_all,
                                self.var_data_major,
                                OpenMDAO=OpenMDAO,
                                data_str='xuser')

        # Set the initial dictionaries to reference all iterations.
        # Later this can be set to reference only the major iterations.
        self.func_data = self.func_data_all
        self.var_data = self.var_data_all

        # Find the maximum length of any variable in the dictionaries and
        # save this as the number of iterations.
        for data_dict in [self.func_data, self.var_data]:
            for key in data_dict.keys():
                length = len(data_dict[key])
                if length > self.num_iter:
                    self.num_iter = length

コード例 #29

0

ファイルを表示

ファイル: simserver.py プロジェクト: StevenLOL/gensim-simserver

class SimServer(object):
    """
    Top-level functionality for similarity services. A similarity server takes
    care of::

    1. creating semantic models
    2. indexing documents using these models
    3. finding the most similar documents in an index.

    An object of this class can be shared across network via Pyro, to answer remote
    client requests. It is thread safe. Using a server concurrently from multiple
    processes is safe for reading = answering similarity queries. Modifying
    (training/indexing) is realized via locking = serialized internally.
    """
    def __init__(self, basename, use_locks=False):
        """
        All data will be stored under directory `basename`. If there is a server
        there already, it will be loaded (resumed).

        The server object is stateless in RAM -- its state is defined entirely by its location.
        There is therefore no need to store the server object.

        Raises `ValueError` if `basename` is not an existing directory.
        """
        if not os.path.isdir(basename):
            raise ValueError("%r must be a writable directory" % basename)
        self.basename = basename
        self.use_locks = use_locks
        self.lock_update = threading.RLock() if use_locks else gensim.utils.nocm

        # Each persisted component is optional: a failed load simply means
        # that the component has not been created yet.
        try:
            self.fresh_index = SimIndex.load(self.location('index_fresh'))
        except Exception:
            logger.debug("starting a new fresh index")
            self.fresh_index = None
        try:
            self.opt_index = SimIndex.load(self.location('index_opt'))
        except Exception:
            logger.debug("starting a new optimized index")
            self.opt_index = None
        try:
            self.model = SimModel.load(self.location('model'))
        except Exception:
            self.model = None
        self.payload = SqliteDict(self.location('payload'),
                                  autocommit=True,
                                  journal_mode=JOURNAL_MODE)
        # clear_buffer=True also creates the initial `self.fresh_docs` buffer
        self.flush(save_index=False, save_model=False, clear_buffer=True)
        logger.info("loaded %s" % self)

    def location(self, name):
        """Return the path of the file called `name` inside the server directory."""
        return os.path.join(self.basename, name)

    def _remove_file(self, fname):
        """Delete `fname` if it exists; a failure is logged, never raised."""
        try:
            if os.path.exists(fname):
                os.remove(fname)
                logger.info("deleted %s" % fname)
        except Exception:
            logger.warning("failed to delete %s" % fname)

    @gensim.utils.synchronous('lock_update')
    def flush(self, save_index=False, save_model=False, clear_buffer=False):
        """
        Commit all changes, clear all caches.

        `save_index` / `save_model` persist the indexes / the model (when they
        exist). `clear_buffer` discards the buffered documents and starts a new,
        empty buffer.
        """
        if save_index:
            if self.fresh_index is not None:
                self.fresh_index.save(self.location('index_fresh'))
            if self.opt_index is not None:
                self.opt_index.save(self.location('index_opt'))
        if save_model and self.model is not None:
            self.model.save(self.location('model'))
        self.payload.commit()
        if clear_buffer:
            if hasattr(self, 'fresh_docs'):
                try:
                    # erase all buffered documents + file on disk
                    self.fresh_docs.terminate()
                except Exception:
                    pass
            # buffer defaults to a random location in temp
            self.fresh_docs = SqliteDict(journal_mode=JOURNAL_MODE)
        self.fresh_docs.sync()

    def close(self):
        """Explicitly close open file handles, databases etc."""
        # Best effort on purpose: this also runs from `__del__`, possibly on a
        # partially constructed object, and some components may be None.
        cleanup_steps = (
            ('payload', 'close'),
            ('model', 'close'),
            ('fresh_index', 'close'),
            ('opt_index', 'close'),
            ('fresh_docs', 'terminate'),
        )
        for attr_name, method_name in cleanup_steps:
            try:
                getattr(getattr(self, attr_name), method_name)()
            except Exception:
                pass

    def __del__(self):
        """When the server went out of scope, make an effort to close its DBs."""
        self.close()

    @gensim.utils.synchronous('lock_update')
    def buffer(self, documents):
        """
        Add a sequence of documents to be processed (indexed or trained on).

        Here, the documents are simply collected; real processing is done later,
        during the `self.index` or `self.train` calls.

        `buffer` can be called repeatedly; the result is the same as if it was
        called once, with a concatenation of all the partial document batches.
        The point is to save memory when sending large corpora over network: the
        entire `documents` must be serialized into RAM. See `utils.upload_chunked()`.

        A call to `flush()` clears this documents-to-be-processed buffer (`flush`
        is also implicitly called when you call `index()` and `train()`).
        """
        logger.info("adding documents to temporary buffer of %s" % (self))
        for doc in documents:
            docid = doc['id']
            if docid in self.fresh_docs:
                logger.warning("asked to re-add id %r; rewriting old value" %
                               docid)
            self.fresh_docs[docid] = doc
        self.fresh_docs.sync()

    @gensim.utils.synchronous('lock_update')
    def train(self,
              corpus=None,
              method='auto',
              clear_buffer=True,
              params=None):
        """
        Create an indexing model. Will overwrite the model if it already exists.
        All indexes become invalid, because documents in them use a now-obsolete
        representation.

        The model is trained on documents previously entered via `buffer`,
        or directly on `corpus`, if specified.

        With `method='auto'`, fewer than 1000 buffered documents select the
        'logentropy' model, otherwise 'lsi'. Raises `ValueError` when there are
        no documents to train on.
        """
        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)
        if not self.fresh_docs:
            msg = "train called but no training corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)
        if method == 'auto':
            numdocs = len(self.fresh_docs)
            if numdocs < 1000:
                # use the module logger, like every other message of this class
                logger.warning(
                    "too few training documents; using simple log-entropy model instead of latent semantic indexing"
                )
                method = 'logentropy'
            else:
                method = 'lsi'
        if params is None:
            params = {}
        self.model = SimModel(self.fresh_docs, method=method, params=params)
        self.flush(save_model=True, clear_buffer=clear_buffer)

    @gensim.utils.synchronous('lock_update')
    def index(self, corpus=None, clear_buffer=True):
        """
        Permanently index all documents previously added via `buffer`, or
        directly index documents from `corpus`, if specified.

        The indexing model must already exist (see `train`) before this function
        is called.

        Raises `AttributeError` when there is no model and `ValueError` when
        there are no documents to index.
        """
        if not self.model:
            msg = 'must initialize model for %s before indexing documents' % self.basename
            logger.error(msg)
            raise AttributeError(msg)

        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)

        if not self.fresh_docs:
            msg = "index called but no indexing corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)

        if not self.fresh_index:
            logger.info("starting a new fresh index for %s" % self)
            self.fresh_index = SimIndex(self.location('index_fresh'),
                                        self.model.num_features)
        self.fresh_index.index_documents(self.fresh_docs, self.model)
        if self.opt_index is not None:
            # re-indexed documents must not survive in the optimized index
            self.opt_index.delete(self.fresh_docs.keys())
        logger.info("storing document payloads")
        for docid in self.fresh_docs:
            payload = self.fresh_docs[docid].get('payload', None)
            if payload is None:
                # HACK: exit on first doc without a payload (=assume all docs have payload, or none does)
                break
            self.payload[docid] = payload
        self.flush(save_index=True, clear_buffer=clear_buffer)

    @gensim.utils.synchronous('lock_update')
    def optimize(self):
        """
        Precompute top similarities for all indexed documents. This speeds up
        `find_similar` queries by id (but not queries by fulltext).

        Internally, documents are moved from a fresh index (=no precomputed similarities)
        to an optimized index (precomputed similarities). Similarity queries always
        query both indexes, so this split is transparent to clients.

        If you add documents later via `index`, they go to the fresh index again.
        To precompute top similarities for these new documents too, simply call
        `optimize` again.

        """
        if self.fresh_index is None:
            logger.warning("optimize called but there are no new documents")
            return  # nothing to do!

        if self.opt_index is None:
            logger.info("starting a new optimized index for %s" % self)
            self.opt_index = SimIndex(self.location('index_opt'),
                                      self.model.num_features)

        self.opt_index.merge(self.fresh_index)
        self.fresh_index.terminate()  # delete old files
        self.fresh_index = None
        self.flush(save_index=True)

    @gensim.utils.synchronous('lock_update')
    def drop_index(self, keep_model=True):
        """Drop all indexed documents. If `keep_model` is False, also drop the model."""
        modelstr = "" if keep_model else "and model "
        logger.info("deleting similarity index " + modelstr +
                    "from %s" % self.basename)

        # delete indexes
        for index in [self.fresh_index, self.opt_index]:
            if index is not None:
                index.terminate()
        self.fresh_index, self.opt_index = None, None

        # delete payload, then start again with an empty payload store
        if self.payload is not None:
            self.payload.close()
            self._remove_file(self.location('payload'))
        self.payload = SqliteDict(self.location('payload'),
                                  autocommit=True,
                                  journal_mode=JOURNAL_MODE)

        # optionally, delete the model as well
        if not keep_model and self.model is not None:
            self.model.close()
            self._remove_file(self.location('model'))
            self.model = None

コード例 #30

0

ファイルを表示

from sqlitedict import SqliteDict

import os

# Open handles of the optimizer history files, keyed by optimizer name.
# A plain dict is used on purpose: assigning to a SqliteDict would try to
# pickle the open handle, and that failure was hidden by the old bare except.
db = {}
opts = ["ipopt", "slsqp", "snopt", "fsqp", "conmin", "nlpqlp", "psqp"]
for opt in opts:
    fileName = "%s_hs015_Hist.hst" % opt
    if not os.path.exists(fileName):
        # Opening a missing file would create an empty database that has no
        # "last" entry; skip optimizers without a history file.
        continue
    try:
        db[opt] = SqliteDict(fileName)
    except Exception:
        # Unreadable history file: leave this optimizer out.
        pass

obj = {}
x1 = {}
x2 = {}

for opt, hist in db.items():
    n = int(hist["last"])

    obj[opt] = []
    x1[opt] = []
    x2[opt] = []
    for i in range(n):
        try:
            # Read everything first so that an incomplete entry cannot leave
            # the three lists with different lengths.
            entry = hist["%d" % i]
            obj_val = entry["funcs"]["obj"]
            xvars = entry["xuser"]["xvars"]
            x1_val, x2_val = xvars[0], xvars[1]
        except Exception:
            # Entry missing or without funcs/xuser data: skip it.
            continue
        obj[opt].append(obj_val)
        x1[opt].append(x1_val)
        x2[opt].append(x2_val)

# Generate the Rosenbrock contours
delta = 0.25

コード例 #31

0

ファイルを表示

ファイル: merge_hlls.py プロジェクト: nic-at/dns-magnitude-lroot

# Merge the per-key HLL objects stored in each input SqliteDict file into the
# SqliteDict at `outputfile`. (This excerpt is truncated inside the merge loop.)
outputfile = args.outputfile
files = args.file
reset = args.reset


# NOTE(review): per the sqlitedict documentation, flag 'w' empties the table
# when opening and 'c' keeps existing content (creating the DB if needed) --
# confirm against the installed sqlitedict version.
if reset:
    flag = 'w'
else:
    flag = 'c'

mergehll = SqliteDict(outputfile, flag=flag, journal_mode='MEMORY',
                      encode=hll_encode, decode=hll_decode)

# Snapshot of the keys present in the output when the merge starts; the
# visible code does not add keys to it afterwards.
mergekeys = set()
for k in mergehll.keys():
    mergekeys.add(k)

for f in files:
    if not os.path.exists(f):
        print("{}: File not found".format(f))
        # NOTE(review): sys.exit() without an argument exits with status 0,
        # although this is an error path.
        sys.exit()
    filehll = SqliteDict(f, encode=hll_encode, decode=hll_decode)
    # merge into outputfile
    for tld, tldhll in filehll.iteritems():
        if tld in mergekeys:   # key already in outputfile -> merge
            hll = mergehll[tld]
            if not hll.__eq__(tldhll):  # merge only if HLL is different
                hll.update(tldhll)
                # write the merged object back; mutating `hll` alone does not
                # store it
                mergehll[tld] = hll
        else:                   # new key in outfile -> copy

コード例 #32

0

ファイルを表示

ファイル: mailbox.py プロジェクト: inpos/serpent

class IMAPMailbox(ExtendedMaildir):
    """
    IMAP mailbox on top of a maildir directory.

    Per-message data (UID and flags, keyed by message file name) is kept in
    `self.msg_info`; per-mailbox data (subscription, mailbox flags, special
    use, UID validity and next UID) is kept in `self.mbox_info`. Both are
    SqliteDict stores opened without `autocommit`; the only explicit commits
    are issued in `close()`.
    """
    implements(imap4.IMailbox, imap4.ICloseableMailbox)

    AppendFactory = SerpentAppendMessageTask

    def __init__(self, path):
        """Initialise the maildir at `path` and open/complete its metadata stores."""
        maildir.initializeMaildir(path)
        self.listeners = []
        self.path = path
        self.open_flags()
        self.lastadded = None
        self.__check_flags_()

    def open_flags(self):
        """Open the message and mailbox metadata stores located inside the maildir."""
        self.msg_info = SqliteDict(os.path.join(self.path, conf.imap_msg_info))
        self.mbox_info = SqliteDict(os.path.join(self.path, conf.imap_mbox_info))

    def _start_monitor(self):
        """Watch the 'new' and 'cur' directories and report changes to `_new_files`."""
        self.notifier = inotify.INotify()
        self.notifier.startReading()
        for subdir in ('new', 'cur'):
            self.notifier.watch(filepath.FilePath(os.path.join(self.path, subdir)),
                                callbacks=[self._new_files])

    def _stop_monitor(self):
        """Stop the directory watcher created by `_start_monitor`."""
        self.notifier.stopReading()
        self.notifier.loseConnection()

    def _new_files(self, wo, path, code):
        """inotify callback: tell all listeners the current message counts."""
        if code in (inotify.IN_MOVED_TO, inotify.IN_DELETE):
            for listener in self.listeners:
                listener.newMessages(self.getMessageCount(), self.getRecentCount())

    def __check_flags_(self):
        """Create missing mailbox defaults and register messages without metadata."""
        # `key in store` is a single lookup; `key in store.keys()` scans all keys.
        if 'subscribed' not in self.mbox_info:
            self.mbox_info['subscribed'] = False
        if 'flags' not in self.mbox_info:
            self.mbox_info['flags'] = []
        if 'special' not in self.mbox_info:
            self.mbox_info['special'] = ''
        if 'uidvalidity' not in self.mbox_info:
            self.mbox_info['uidvalidity'] = random.randint(0, 2**32)
        if 'uidnext' not in self.mbox_info:
            self.mbox_info['uidnext'] = 1
        for msg_path in self.__msg_list_():
            parts = msg_path.split('/')
            fn = parts[-1]
            if fn in self.msg_info:
                continue
            info = {'uid': self.getUIDNext()}
            # messages already moved to 'cur' are registered as seen
            if parts[-2] == 'new':
                info['flags'] = []
            else:
                info['flags'] = [misc.IMAP_FLAGS['SEEN']]
            self.msg_info[fn] = info

    def subscribe(self):
        """Mark the mailbox as subscribed."""
        self.mbox_info['subscribed'] = True

    def unsubscribe(self):
        """Mark the mailbox as not subscribed."""
        self.mbox_info['subscribed'] = False

    def is_subscribed(self):
        """Return the stored subscription state."""
        return self.mbox_info['subscribed']

    def __count_flagged_msgs_(self, flag):
        """Return the number of registered messages carrying `flag`."""
        return sum(1 for info in self.msg_info.values() if flag in info['flags'])

    def getHierarchicalDelimiter(self):
        return misc.IMAP_HDELIM

    def setSpecial(self, special):
        """Store the special-use marker reported by `getMboxFlags`."""
        self.mbox_info['special'] = special

    def getFlags(self):
        """Return the sorted message flags from `misc.IMAP_FLAGS`."""
        return sorted(misc.IMAP_FLAGS.values())

    def getMboxFlags(self):
        """Return the mailbox flags plus the special-use marker, if one is set."""
        f = list(self.mbox_info['flags'])
        if self.mbox_info['special'] != '':
            f.append(self.mbox_info['special'])
        return f

    def addFlag(self, flag):
        """Add `flag` to the stored mailbox flags (no duplicates)."""
        self.mbox_info['flags'] = list(set(self.mbox_info['flags']).union([flag]))

    def removeFlag(self, flag):
        """Remove `flag` from the stored mailbox flags."""
        self.mbox_info['flags'] = list(set(self.mbox_info['flags']).difference([flag]))

    def hasChildren(self):
        """Record that the mailbox has children (set HASCHILDREN, clear HASNOCHILDREN)."""
        # Test the stored *mailbox* flags. getFlags() returns the message
        # flags, so testing it never removed the opposite marker.
        flags = self.mbox_info['flags']
        if misc.MBOX_FLAGS['HASCHILDREN'] not in flags:
            self.addFlag(misc.MBOX_FLAGS['HASCHILDREN'])
        if misc.MBOX_FLAGS['HASNOCHILDREN'] in flags:
            self.removeFlag(misc.MBOX_FLAGS['HASNOCHILDREN'])

    def hasNoChildren(self):
        """Record that the mailbox has no children (set HASNOCHILDREN, clear HASCHILDREN)."""
        flags = self.mbox_info['flags']
        if misc.MBOX_FLAGS['HASNOCHILDREN'] not in flags:
            self.addFlag(misc.MBOX_FLAGS['HASNOCHILDREN'])
        if misc.MBOX_FLAGS['HASCHILDREN'] in flags:
            self.removeFlag(misc.MBOX_FLAGS['HASCHILDREN'])

    def getMessageCount(self):
        """Return the number of registered messages not flagged as deleted."""
        deleted = misc.IMAP_FLAGS['DELETED']
        return sum(1 for info in self.msg_info.values() if deleted not in info['flags'])

    def getRecentCount(self):
        """Return the number of recent messages and clear their RECENT flag."""
        recent = misc.IMAP_FLAGS['RECENT']
        c = 0
        # materialise the keys first: entries are rewritten inside the loop
        for fn in list(self.msg_info.keys()):
            info = self.msg_info[fn]
            if recent in info['flags']:
                c += 1
                # stored as a list, like every other writer of 'flags'
                info['flags'] = list(set(info['flags']).difference([recent]))
                self.msg_info[fn] = info
        return c

    def getUnseenCount(self):
        """Return the message count minus the number of messages flagged as seen."""
        return self.getMessageCount() - self.__count_flagged_msgs_(misc.IMAP_FLAGS['SEEN'])

    def isWriteable(self):
        return True

    def getUIDValidity(self):
        return self.mbox_info['uidvalidity']

    def getUIDNext(self):
        """Return the next free UID and advance the stored counter."""
        un = self.mbox_info['uidnext']
        self.mbox_info['uidnext'] = un + 1
        return un

    def getUID(self, num):
        return num

    def addMessage(self, message, flags = (), date = None):
        """Append `message`, then register its metadata via `_cbAddMessage`."""
        return self.appendMessage(message).addCallback(self._cbAddMessage, flags)

    def _cbAddMessage(self, obj, flags):
        """Register the message stored at `self.lastadded` with a new UID and `flags`."""
        path = self.lastadded
        self.lastadded = None
        fn = path.split('/')[-1]
        self.msg_info[fn] = {'uid': self.getUIDNext(), 'flags': flags}
        # a message that is already seen belongs into 'cur'
        if misc.IMAP_FLAGS['SEEN'] in flags and path.split('/')[-2] != 'cur':
            new_path = os.path.join(self.path, 'cur', fn)
            os.rename(path, new_path)

    def __msg_list_(self):
        """Return the paths of all files in 'new', followed by those in 'cur'."""
        return [os.path.join(self.path, subdir, m)
                for subdir in ('new', 'cur')
                for m in os.listdir(os.path.join(self.path, subdir))]

    def _seqMessageSetToSeqDict(self, messageSet):
        """Map the sequence numbers in `messageSet` to message file paths."""
        # the count is read once; nothing in this method changes it
        count = self.getMessageCount()
        if not messageSet.last:
            messageSet.last = count

        seqMap = {}
        msgs = self.__msg_list_()
        for messageNum in messageSet:
            if 0 < messageNum <= count:
                seqMap[messageNum] = msgs[messageNum - 1]
        return seqMap

    def fetch(self, messages, uid):
        """Return `[id, MaildirMessage]` pairs for the requested messages."""
        result = []
        for seq, filename in self.__fetch_(messages, uid).items():
            # read through a context manager so the file handle is released
            with open(filename, 'rb') as fh:
                content = fh.read()
            result.append([seq, MaildirMessage(seq,
                                               content,
                                               self.msg_info[filename.split('/')[-1]]['flags'],
                                               rfc822date())])
        return result

    def __fetch_(self, messages, uid):
        """
        Return a dict mapping message id to file path.

        With `uid` set, `messages` holds UIDs and only messages whose file
        exists in 'new' or 'cur' are returned; otherwise `messages` holds
        sequence numbers.
        """
        if not uid:
            return self._seqMessageSetToSeqDict(messages)

        messagesToFetch = {}
        if not messages.last:
            messages.last = self.mbox_info['uidnext']
        # index the file names by UID once instead of scanning per requested UID
        names_by_uid = {}
        for fn in self.msg_info.keys():
            names_by_uid.setdefault(self.msg_info[fn]['uid'], []).append(fn)
        for msg_uid in messages:
            for name in names_by_uid.get(msg_uid, ()):
                for subdir in ('new', 'cur'):
                    candidate = os.path.join(self.path, subdir, name)
                    if os.path.exists(candidate):
                        messagesToFetch[msg_uid] = candidate
                        break
        return messagesToFetch

    def store(self, messages, flags, mode, uid):
        """
        Change the flags of the requested messages and return `{id: flags}`.

        `mode < 0` removes `flags`, `mode == 0` replaces the stored flags and
        `mode > 0` adds `flags`. When the SEEN flag is removed/added the file
        is moved to 'new'/'cur' respectively.
        """
        d = {}
        for _id, path in self.__fetch_(messages, uid).items():
            filename = path.split('/')[-1]
            if mode < 0:
                old_f = self.msg_info[filename]
                old_f['flags'] = list(set(old_f['flags']).difference(set(flags)))
                self.msg_info[filename] = old_f
                if misc.IMAP_FLAGS['SEEN'] in flags and path.split('/')[-2] != 'new':
                    new_path = os.path.join(self.path, 'new', filename)
                    os.rename(path, new_path)
            elif mode == 0:
                old_f = self.msg_info[filename]
                old_f['flags'] = flags
                self.msg_info[filename] = old_f
            elif mode > 0:
                old_f = self.msg_info[filename]
                old_f['flags'] = list(set(old_f['flags']).union(set(flags)))
                self.msg_info[filename] = old_f
                if misc.IMAP_FLAGS['SEEN'] in flags and path.split('/')[-2] != 'cur':
                    new_path = os.path.join(self.path, 'cur', filename)
                    os.rename(path, new_path)
            d[_id] = self.msg_info[filename]['flags']
        return d

    def expunge(self):
        """Delete every message flagged as deleted and return the removed UIDs."""
        uids = []
        for path in self.__msg_list_():
            fn = path.split('/')[-1]
            if fn not in self.msg_info:
                continue
            info = self.msg_info[fn]
            if misc.IMAP_FLAGS['DELETED'] in info['flags']:
                os.remove(path)
                del self.msg_info[fn]
                uids.append(info['uid'])
        return uids

    def addListener(self, listener):
        self.listeners.append(listener)
        return True

    def removeListener(self, listener):
        self.listeners.remove(listener)
        return True

    def requestStatus(self, names):
        return imap4.statusRequestHelper(self, names)

    def destroy(self):
        pass

    def close(self):
        """Once no listener is left: stop monitoring, optionally expunge, commit and close."""
        print('!!! %s - %d !!!' % (self.path, len(self.listeners)))
        if len(self.listeners) == 0:
            self._stop_monitor()
            if conf.imap_expunge_on_close:
                self.expunge()
            self.msg_info.commit(blocking=False)
            self.mbox_info.commit(blocking = False)
            self.msg_info.close()
            self.mbox_info.close()

コード例 #33

0

ファイルを表示

from sqlitedict import SqliteDict

# PageID -> [Parent1, Parent2, ...]
pageID2Parent = SqliteDict('./db/pageID2Parent.sqlite')
# PageID -> [PageTitle, LastModified, Size, WordFreqDict, Children]
pageID2Meta = SqliteDict('./db/pageID2Meta.sqlite')


def findParent(child_id):
    """Return the ids of all pages whose children list (meta[4]) contains `child_id`.

    `child_id` is converted with `int()` before the comparison. Every call
    scans the whole `pageID2Meta` store.
    """
    child = int(child_id)  # converted once instead of once per scanned page
    return [parent_id for parent_id, meta in pageID2Meta.items()
            if child in meta[4]]


try:
    # the page count only feeds the progress output; query it once
    total = len(pageID2Meta)
    for i, page_id in enumerate(pageID2Meta.keys()):
        print(str(i), total)
        parents = findParent(page_id)
        pageID2Parent[page_id] = parents

    pageID2Parent.commit()
finally:
    # release both databases even when the loop fails part-way
    pageID2Parent.close()
    pageID2Meta.close()

コード例 #34

0

ファイルを表示

ファイル: drivetrain.py プロジェクト: robfalck/MagnePlane

    print
    print('bat out v: %f ' % top['battery.output_voltage'])
    print('invert in v: %f' % top['inverter.input_voltage'])
    print('mot in volt: %f' % top['motor.phase_voltage'])
    print('invert out volt: %f' % top['inverter.output_voltage'])
    print
    print('invert in pow %f' % top['inverter.input_power'])
    in_pow = top['inverter.input_voltage'] * top['inverter.input_current']
    print('calc inverter in pow %f' % in_pow)
    output_power = top['inverter.output_voltage'] * top[
        'inverter.output_current'] * 3.0 * np.sqrt(2.0 / 3.0)
    print('calc output pow %f' % output_power)
    print('bat des pow: %f' % top['battery.des_power'])
    print
    print('Inv in cur %f' % top['inverter.input_current'])
    print('mot des pow %f' % top['design_power'])
    print('mot input cur %f' % top['motor.phase_current'])
    print('mot input volt %f' % top['motor.phase_voltage'])

    # print('ncells %f' % top['Battery'])

    db = SqliteDict('drivetraindb', 'openmdao')
    pprint(db.keys())
    data = db['rank0:Driver/1']
    pprint(data['Parameters'])
    print
    print
    pprint(data['Unknowns'])
    top.cleanup()
    remove('drivetraindb')

コード例 #35

0

ファイルを表示

ファイル: simserver.py プロジェクト: jannson/gensim-simserver

class SimServer(object):
    """
    Top-level functionality for similarity services. A similarity server takes
    care of::

    1. creating semantic models
    2. indexing documents using these models
    3. finding the most similar documents in an index.

    An object of this class can be shared across network via Pyro, to answer remote
    client requests. It is thread safe. Using a server concurrently from multiple
    processes is safe for reading = answering similarity queries. Modifying
    (training/indexing) is realized via locking = serialized internally.
    """
    def __init__(self, basename, use_locks=False):
        """
        All data will be stored under directory `basename`. If there is a server
        there already, it will be loaded (resumed).

        The server object is stateless in RAM -- its state is defined entirely by its location.
        There is therefore no need to store the server object.

        If `use_locks` is true, a `threading.RLock` is stored in `lock_update`;
        otherwise `gensim.utils.nocm` is used in its place.

        Raises ValueError if `basename` is not an existing directory.
        """
        if not os.path.isdir(basename):
            raise ValueError("%r must be a writable directory" % basename)
        self.basename = basename
        self.use_locks = use_locks
        self.lock_update = threading.RLock() if use_locks else gensim.utils.nocm
        # Each persisted component is optional: when loading fails we start
        # without it. `except Exception` (not a bare except) is used so that
        # KeyboardInterrupt / SystemExit still propagate.
        try:
            self.fresh_index = SimIndex.load(self.location('index_fresh'))
        except Exception:
            logger.debug("starting a new fresh index")
            self.fresh_index = None
        try:
            self.opt_index = SimIndex.load(self.location('index_opt'))
        except Exception:
            logger.debug("starting a new optimized index")
            self.opt_index = None
        try:
            self.model = SimModel.load(self.location('model'))
        except Exception:
            self.model = None
        self.payload = SqliteDict(self.location('payload'), autocommit=True, journal_mode=JOURNAL_MODE)
        # The initial flush saves nothing; it creates the empty `fresh_docs` buffer.
        self.flush(save_index=False, save_model=False, clear_buffer=True)
        logger.info("loaded %s" % self)


    def location(self, name):
        """Return the path of `name` inside this server's `basename` directory."""
        storage_dir = self.basename
        return os.path.join(storage_dir, name)


    @gensim.utils.synchronous('lock_update')
    def flush(self, save_index=False, save_model=False, clear_buffer=False):
        """
        Commit all changes, clear all caches.

        `save_index` -- save the fresh and optimized indexes (those that exist).
        `save_model` -- save the model, if there is one.
        `clear_buffer` -- discard the buffered documents and start a new, empty
        `fresh_docs` buffer.

        The payload store is always committed and `fresh_docs` always synced.
        """
        if save_index:
            if self.fresh_index is not None:
                self.fresh_index.save(self.location('index_fresh'))
            if self.opt_index is not None:
                self.opt_index.save(self.location('index_opt'))
        if save_model:
            if self.model is not None:
                self.model.save(self.location('model'))
        self.payload.commit()
        if clear_buffer:
            if hasattr(self, 'fresh_docs'):
                try:
                    self.fresh_docs.terminate() # erase all buffered documents + file on disk
                except Exception:
                    # Best effort: the buffer is replaced below anyway, but leave
                    # a trace instead of swallowing the failure silently.
                    logger.debug("failed to terminate the old document buffer", exc_info=True)
            self.fresh_docs = SqliteDict(journal_mode=JOURNAL_MODE) # buffer defaults to a random location in temp
        self.fresh_docs.sync()


    def close(self):
        """
        Explicitly close open file handles, databases etc.

        Every resource is released independently and on a best-effort basis: a
        missing attribute (e.g. after a failed `__init__`), a `None` component
        or a failing close must not prevent the remaining ones from being
        released. This method is also invoked from `__del__`.
        """
        resources = (
            ('payload', 'close'),
            ('model', 'close'),
            ('fresh_index', 'close'),
            ('opt_index', 'close'),
            ('fresh_docs', 'terminate'),  # the buffer is temporary: erase it
        )
        for attr_name, method_name in resources:
            try:
                getattr(getattr(self, attr_name), method_name)()
            except Exception:
                # Deliberately ignored (see docstring). Unlike a bare except,
                # KeyboardInterrupt / SystemExit are not swallowed.
                pass

    def __del__(self):
        """Finalizer: release the server's databases by delegating to `close()`."""
        # Nothing else to clean up here; `close()` does all the work.
        self.close()

    @gensim.utils.synchronous('lock_update')
    def buffer(self, documents):
        """
        Collect a sequence of documents for later processing (indexing or training).

        Nothing is processed here; the documents are only stored in the
        `fresh_docs` buffer, keyed by their `'id'` field. The real work happens
        in the subsequent `self.index` or `self.train` call.

        Calling `buffer` several times is equivalent to calling it once with
        the concatenation of all the partial batches. This keeps memory usage
        down when large corpora are sent over the network, where the whole
        `documents` argument must be serialized into RAM. See
        `utils.upload_chunked()`.

        The buffer is emptied by `flush()` when it is asked to clear it
        (`index()` and `train()` do so by default).
        """
        logger.info("adding documents to temporary buffer of %s" % (self))
        pending = self.fresh_docs
        for document in documents:
            key = document['id']
            if key in pending:
                # same id seen before: the newer document wins
                logger.warning("asked to re-add id %r; rewriting old value" % key)
            pending[key] = document
        pending.sync()


    @gensim.utils.synchronous('lock_update')
    def train(self, corpus=None, method='auto', clear_buffer=True, params=None):
        """
        Create an indexing model. Will overwrite the model if it already exists.
        All indexes become invalid, because documents in them use a now-obsolete
        representation.

        The model is trained on documents previously entered via `buffer`,
        or directly on `corpus`, if specified.

        With `method='auto'`, fewer than 1000 buffered documents select the
        'logentropy' method, otherwise 'lsi' is used. `params` is passed on to
        `SimModel` (an empty dict when None).

        Raises ValueError if there are no documents to train on.
        """
        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)
        if not self.fresh_docs:
            msg = "train called but no training corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)
        if method == 'auto':
            numdocs = len(self.fresh_docs)
            if numdocs < 1000:
                # Use the same logger as every other message of this class
                # (this one used to go to the root logger via `logging.warning`).
                logger.warning("too few training documents; using simple log-entropy model instead of latent semantic indexing")
                method = 'logentropy'
            else:
                method = 'lsi'
        if params is None:
            params = {}
        self.model = SimModel(self.fresh_docs, method=method, params=params)
        self.flush(save_model=True, clear_buffer=clear_buffer)


    @gensim.utils.synchronous('lock_update')
    def index(self, corpus=None, clear_buffer=True):
        """
        Permanently index the buffered documents (see `buffer`), or the
        documents of `corpus` when one is given.

        A model must have been created beforehand (see `train`).

        Raises AttributeError when there is no model, and ValueError when
        there is nothing to index.
        """
        if not self.model:
            complaint = 'must initialize model for %s before indexing documents' % self.basename
            logger.error(complaint)
            raise AttributeError(complaint)

        if corpus is not None:
            # an explicit corpus replaces whatever was buffered so far
            self.flush(clear_buffer=True)
            self.buffer(corpus)

        if not self.fresh_docs:
            complaint = "index called but no indexing corpus specified for %s" % self
            logger.error(complaint)
            raise ValueError(complaint)

        if not self.fresh_index:
            logger.info("starting a new fresh index for %s" % self)
            self.fresh_index = SimIndex(self.location('index_fresh'), self.model.num_features)
        self.fresh_index.index_documents(self.fresh_docs, self.model)

        # ids that are being (re)indexed must disappear from the optimized index
        if self.opt_index is not None:
            self.opt_index.delete(self.fresh_docs.keys())

        logger.info("storing document payloads")
        for docid in self.fresh_docs:
            doc_payload = self.fresh_docs[docid].get('payload')
            if doc_payload is None:
                # HACK: stop at the first document without a payload
                # (=assume either all documents carry a payload, or none does)
                break
            self.payload[docid] = doc_payload
        self.flush(save_index=True, clear_buffer=clear_buffer)


    @gensim.utils.synchronous('lock_update')
    def optimize(self):
        """
        Precompute top similarities for all indexed documents, which speeds up
        `find_similar` queries by id (but not queries by fulltext).

        Documents are moved from the fresh index (=no precomputed similarities)
        into the optimized index (precomputed similarities). Similarity queries
        always consult both indexes, so clients do not notice the split.

        Documents added later via `index` land in the fresh index again; call
        `optimize` once more to precompute similarities for them as well.

        """
        fresh = self.fresh_index
        if fresh is None:
            logger.warning("optimize called but there are no new documents")
            return # nothing to move

        if self.opt_index is None:
            logger.info("starting a new optimized index for %s" % self)
            self.opt_index = SimIndex(self.location('index_opt'), self.model.num_features)

        self.opt_index.merge(fresh)
        fresh.terminate() # the merged index no longer needs its files
        self.fresh_index = None
        self.flush(save_index=True)


    @gensim.utils.synchronous('lock_update')
    def drop_index(self, keep_model=True):
        """
        Drop all indexed documents. If `keep_model` is False, also drop the model.

        Both indexes are terminated, the payload store is closed, its file is
        deleted and a new empty payload store is opened in its place. Failing
        to delete a file is logged as a warning and does not raise.
        """
        def remove_file(fname):
            # Best-effort removal of a file belonging to this server.
            try:
                if os.path.exists(fname):
                    os.remove(fname)
                    logger.info("deleted %s" % fname)
            except Exception:
                logger.warning("failed to delete %s" % fname)

        modelstr = "" if keep_model else "and model "
        logger.info("deleting similarity index " + modelstr + "from %s" % self.basename)

        # delete indexes
        for index in [self.fresh_index, self.opt_index]:
            if index is not None:
                index.terminate()
        self.fresh_index, self.opt_index = None, None

        # delete payload
        if self.payload is not None:
            self.payload.close()
            remove_file(self.location('payload'))
        self.payload = SqliteDict(self.location('payload'), autocommit=True, journal_mode=JOURNAL_MODE)

        # optionally, delete the model as well
        if not keep_model and self.model is not None:
            self.model.close()
            remove_file(self.location('model'))
            self.model = None

コード例 #36

0

ファイルを表示

    def OptimizationHistory(self):
        """
        Reads in database history file and stores contents.
        Function information is stored as a dict in func_data,
        variable information is stored as a dict in var_data,
        and bounds information is stored as a dict in bounds.

        Every file name in self.histList is processed in turn. A file is first
        opened with shelve; if that fails it is opened as a SqliteDict and
        treated as an OpenMDAO history when its "iterations" table has keys,
        otherwise as a pyOptSparse history. With more than one history file,
        a suffix "_A", "_B", ... is appended to the bounds/scaling keys.

        Sets, among others: func_data_all/func_data_major,
        var_data_all/var_data_major, func_data, var_data, iter_type,
        histIndex and num_iter.
        """

        # Initialize dictionaries for design variables and unknowns.
        # The data is saved redundantly in dicts for all iterations and then
        # for major iterations as well.
        self.func_data_all = {}
        self.func_data_major = {}
        self.var_data_all = {}
        self.var_data_major = {}
        db = {}
        self.num_iter = 0

        # Loop over each history file name provided by the user.
        for histIndex, histFileName in enumerate(self.histList):

            # If they only have one history file, we don't change the keys' names
            if len(self.histList) == 1:
                histIndex = ""
            else:  # If multiple history files, append letters to the keys,
                # such that 'key' becomes 'key_A', 'key_B', etc
                histIndex = "_" + chr(histIndex + ord("A"))
            self.histIndex = histIndex

            try:  # This is the classic method of storing history files
                db = shelve.open(histFileName, "r")
                OpenMDAO = False
            except Exception:  # Broad catch: per the original author, the error raised here is not a standard Python one.
                # If the db has the 'iterations' tag, it's an OpenMDAO db.
                db = SqliteDict(histFileName, "iterations")
                OpenMDAO = True

                # Need to do this since in py3 db.keys() is a generator object
                keys = [i for i in db.keys()]

                # If it has no 'iterations' tag, it's a pyOptSparse db.
                if keys == []:
                    OpenMDAO = False
                    db = SqliteDict(histFileName)

            # Specific instructions for OpenMDAO databases
            if OpenMDAO:

                # Get the number of iterations from the LAST key yielded by
                # db.keys(): after this loop `string` holds that key split on "|".
                # NOTE(review): this assumes the last key carries the highest
                # iteration number -- confirm against the recorder's key order.
                for string in db.keys():
                    string = string.split("|")

                nkey = int(string[-1])
                self.solver_name = string[0]

                # Initalize a list detailing if the iterations are major or minor
                self.iter_type = np.zeros(nkey)

                # Get the keys of the database where derivatives were evaluated.
                # These correspond to major iterations, while no derivative
                # info is calculated for gradient-free linesearches.
                deriv_keys = SqliteDict(histFileName, "derivs").keys()
                self.deriv_keys = [
                    int(key.split("|")[-1]) for key in deriv_keys
                ]

                # Work out which iterations are major ones (delegated to
                # DetermineMajorIterations).
                self.DetermineMajorIterations(db, OpenMDAO=OpenMDAO)

                # Save information from the history file for the unknowns.
                self.SaveDBData(db,
                                self.func_data_all,
                                self.func_data_major,
                                OpenMDAO=OpenMDAO,
                                data_str="Unknowns")

                # Save information from the history file for the design variables.
                self.SaveDBData(db,
                                self.var_data_all,
                                self.var_data_major,
                                OpenMDAO=OpenMDAO,
                                data_str="Parameters")

                # Add labels to OpenMDAO variables.
                # Corresponds to constraints, design variables, and objective.
                # Note that `db` is rebound to the "metadata" table from here on.
                try:
                    db = SqliteDict(histFileName, "metadata")
                    self.SaveOpenMDAOData(db)

                except KeyError:  # Skip metadata info if not included in OpenMDAO hist file
                    pass

            else:

                # Get the number of iterations
                nkey = int(db["last"]) + 1
                self.nkey = nkey

                # Initalize a list detailing if the iterations are major or minor
                # 1 = major, 2 = minor, 0 = sensitivity (or duplicated info by IPOPT)
                # The entries whose iter_type = 0 will be ignored.
                self.iter_type = np.zeros(nkey)

                # Check to see if there is bounds information in the db file.
                # If so, add them to self.bounds to plot later.
                try:
                    try:
                        info_dict = db["varInfo"].copy()
                        info_dict.update(db["conInfo"])
                        scale_info = True
                    except KeyError:
                        # No "varInfo"/"conInfo" entries: fall back to the
                        # bounds-only entries.
                        self.warning_display(
                            "This is an older optimization history file.\n" +
                            "Only bounds information has been stored, not scalar info."
                        )
                        info_dict = db["varBounds"].copy()
                        info_dict.update(db["conBounds"])
                        scale_info = False

                    # Got to be a little tricky here since we're modifying
                    # info_dict; if we simply loop over it with the generator
                    # from Python3, it will contain the new keys and then the
                    # names will be mangled incorrectly.
                    # The renamed entries are therefore collected in separate dicts.
                    bounds_dict = {}
                    scaling_dict = {}
                    for key in info_dict.keys():
                        bounds_dict[key + histIndex] = {
                            "lower": info_dict[key]["lower"],
                            "upper": info_dict[key]["upper"],
                        }
                        if scale_info:
                            scaling_dict[key +
                                         histIndex] = info_dict[key]["scale"]

                    self.bounds.update(bounds_dict)
                    if scale_info:
                        self.scaling.update(scaling_dict)
                except KeyError:
                    # A KeyError raised anywhere in the block above (e.g. the
                    # fallback entries are missing too): skip the bounds.
                    pass

                # Check to see if there is proper saved info about iter type
                if "isMajor" in db["0"].keys():
                    self.storedIters = True
                else:
                    self.storedIters = False

                # Raise warning for IPOPT's duplicated history
                if db["metadata"]["optimizer"] == "IPOPT" and "iter" not in db[
                        "0"].keys():
                    pyOptSparseWarning(
                        "The optimization history file has duplicated entries at every iteration, and the OptView plot is not correct. "
                        +
                        "Re-run the optimization with a current version of pyOptSparse to generate a correct history file."
                    )

                # Work out which iterations are major ones (delegated to
                # DetermineMajorIterations).
                self.DetermineMajorIterations(db, OpenMDAO=OpenMDAO)

                # Save information from the history file for the funcs.
                self.SaveDBData(db,
                                self.func_data_all,
                                self.func_data_major,
                                OpenMDAO=OpenMDAO,
                                data_str="funcs")

                # Save information from the history file for the design variables.
                self.SaveDBData(db,
                                self.var_data_all,
                                self.var_data_major,
                                OpenMDAO=OpenMDAO,
                                data_str="xuser")

        # Set the initial dictionaries to reference all iterations.
        # Later this can be set to reference only the major iterations.
        self.func_data = self.func_data_all
        self.var_data = self.var_data_all

        # Find the maximum length of any variable in the dictionaries and
        # save this as the number of iterations.
        for data_dict in [self.func_data, self.var_data]:
            for key in data_dict.keys():
                length = len(data_dict[key])
                if length > self.num_iter:
                    self.num_iter = length

コード例 #37

0

ファイルを表示

ファイル: OptView.py プロジェクト: hwangjt/OpenAeroStruct

    def OptimizationHistory(self):
        """
        Reads in database history file and stores contents.
        Function information is stored as a dict in func_data,
        variable information is stored as a dict in var_data,
        and bounds information is stored as a dict in bounds.

        Every file name in self.histList is processed in turn. A file is first
        opened with shelve; if that fails it is opened as a SqliteDict and
        treated as an OpenMDAO history when its 'iterations' table has keys,
        otherwise as a pyOptSparse history. With more than one history file,
        a suffix '_A', '_B', ... is appended to the bounds keys.

        The method does not depend on whether `keys()` / `items()` return
        lists or lazy iterables/views, so it behaves the same on Python 2 and 3.
        """

        # Initialize dictionaries for design variables and unknowns.
        # The data is saved redundantly in dicts for all iterations and then
        # for major iterations as well.
        self.func_data_all = {}
        self.func_data_major = {}
        self.var_data_all = {}
        self.var_data_major = {}
        db = {}
        self.num_iter = 0

        # Loop over each history file name provided by the user.
        for histIndex, histFileName in enumerate(self.histList):

            # If they only have one history file, we don't change the keys' names
            if len(self.histList) == 1:
                histIndex = ''
            else: # If multiple history files, append letters to the keys,
                  # such that 'key' becomes 'key_A', 'key_B', etc
                histIndex = '_' + chr(histIndex + ord('A'))
            self.histIndex = histIndex

            try: # This is the classic method of storing history files
                db = shelve.open(histFileName, 'r')
                OpenMDAO = False
            except Exception: # Broad catch: the error raised here is not a standard Python one.
                # If the db has the 'iterations' tag, it's an OpenMDAO db.
                db = SqliteDict(histFileName, 'iterations')
                OpenMDAO = True

                # If it has no 'iterations' tag, it's a pyOptSparse db.
                # keys() may be a generator, which never compares equal to [],
                # so materialize it before testing for emptiness.
                if not list(db.keys()):
                    OpenMDAO = False
                    db = SqliteDict(histFileName)

            # Specific instructions for OpenMDAO databases
            if OpenMDAO:

                # Get the number of iterations from the last key of the db.
                # Walking the keys works whether keys() is a list or a
                # generator, so no interpreter-version switch is needed.
                last_key = None
                for last_key in db.keys():
                    pass
                string = last_key.split('|')

                nkey = int(string[-1])
                self.solver_name = string[0]

                # Initalize a list detailing if the iterations are major or minor
                self.iter_type = np.zeros(nkey)

                # Get the keys of the database where derivatives were evaluated.
                # These correspond to major iterations, while no derivative
                # info is calculated for gradient-free linesearches.
                deriv_keys = SqliteDict(histFileName, 'derivs').keys()
                self.deriv_keys = [int(key.split('|')[-1]) for key in deriv_keys]

                # Save information from the history file for the unknowns.
                self.SaveDBData(db, self.func_data_all, self.func_data_major, OpenMDAO=OpenMDAO, data_str='Unknowns')

                # Save information from the history file for the design variables.
                self.SaveDBData(db, self.var_data_all, self.var_data_major, OpenMDAO=OpenMDAO, data_str='Parameters')

                # Add labels to OpenMDAO variables.
                # Corresponds to constraints, design variables, and objective.
                # Note that `db` is rebound to the 'metadata' table from here on.
                try:
                    db = SqliteDict(histFileName, 'metadata')
                    self.SaveOpenMDAOData(db)

                except KeyError: # Skip metadata info if not included in OpenMDAO hist file
                    pass

            else:

                # Get the number of iterations
                nkey = int(db['last']) + 1
                self.nkey = nkey

                # Initalize a list detailing if the iterations are major or minor
                self.iter_type = np.zeros(nkey)

                # Check to see if there is bounds information in the db file.
                # If so, add them to self.bounds to plot later.
                try:
                    # Merge variable and constraint bounds (constraints win on
                    # a name clash) without concatenating items(), which only
                    # works when items() returns lists.
                    merged_bounds = dict(db['varBounds'])
                    merged_bounds.update(db['conBounds'])
                    # Build the renamed mapping as a new dict instead of
                    # popping/inserting keys while iterating over them.
                    bounds_dict = {key + histIndex: value
                                   for key, value in merged_bounds.items()}
                    self.bounds.update(bounds_dict)
                except KeyError:
                    pass

                # Check to see if there is proper saved info about iter type
                if 'isMajor' in db['0'].keys():
                    self.storedIters = True
                else:
                    self.storedIters = False

                # Save information from the history file for the funcs.
                self.SaveDBData(db, self.func_data_all, self.func_data_major, OpenMDAO=OpenMDAO, data_str='funcs')

                # Save information from the history file for the design variables.
                self.SaveDBData(db, self.var_data_all, self.var_data_major, OpenMDAO=OpenMDAO, data_str='xuser')

        # Set the initial dictionaries to reference all iterations.
        # Later this can be set to reference only the major iterations.
        self.func_data = self.func_data_all
        self.var_data = self.var_data_all

        # Find the maximum length of any variable in the dictionaries and
        # save this as the number of iterations.
        for data_dict in [self.func_data, self.var_data]:
            for key in data_dict.keys():
                length = len(data_dict[key])
                if length > self.num_iter:
                    self.num_iter = length

コード例 #38

0

ファイルを表示

ファイル: io_utils.py プロジェクト: ChaseDuncan/lorelei2018

class LORELEIKBLoader:
    """
        Class for loading the LORELEI knowledge base (KB).

        The class allows the KB data to be accessed in two ways, either
        using the unique KB id or using the surface of a mention which
        may map to multiple records in the kb.

        Indexing the loader (`loader[entity_id]`) and `loader.keys()` are
        forwarded to the entity-id map `self.kb`.
    """
    def __init__(self, kbfile):
        # map of entity id to kb record
        self.kb = {}
        # map of surface form of mention to list of kb records to which
        # it may refer.
        self.name2ent = {}
        self._load_kb(kbfile)

    def _open_readonly(self, e2e_path, n2e_path):
        """(Re)open both on-disk maps in read-only mode."""
        self.kb = SqliteDict(e2e_path, tablename='lorelei', flag='r')
        self.name2ent = SqliteDict(n2e_path,
                                   tablename='name2ent',
                                   flag='r')

    def _persist(self):
        """Commit and close both (writable) on-disk maps."""
        logging.info("Writing KB dictionary to disk.")
        self.kb.commit()
        self.kb.close()
        self.name2ent.commit()
        self.name2ent.close()

    def _load_kb(self, kbfile):
        """
            Helper function which builds the primary resources of the class.
            Loads the LORELEI kb data which is to
            be found at the provided path. Checks if the source data has already
            been preprocessed and stored as a dictionary. If it has, then those
            files are loaded in, if not then the dictonaries are first built and
            saved, then loaded.

            The preprocessed maps live next to the source file, at
            `kbfile + "e2e.pkl"` and `kbfile + "n2e.pkl"` (despite the
            extension they are opened as SqliteDict databases).

            A KeyboardInterrupt during the build stops reading early; whatever
            was read so far is still written to disk and reopened read-only.
            NOTE: such a partial map is picked up as-is by the next run,
            because only the existence of the file is checked.

            @param: kbfile, the path to source kb which will be loaded
        """
        e2e_path = kbfile + "e2e.pkl"
        n2e_path = kbfile + "n2e.pkl"
        if os.path.exists(e2e_path):
            logging.info("pkl found! loading map %s", e2e_path)
            self._open_readonly(e2e_path, n2e_path)
            return

        logging.info("pkl not found ...")
        self.kb = SqliteDict(e2e_path,
                             tablename='lorelei',
                             autocommit=False)
        self.name2ent = SqliteDict(n2e_path,
                                   tablename='name2ent',
                                   autocommit=False)
        try:
            # The context manager closes the source file even on errors.
            with open(kbfile) as kb_lines:
                for idx, line in enumerate(kb_lines):
                    if idx > 0 and idx % 1000000 == 0:
                        logging.info("read %d lines", idx)

                    parts = line.rstrip('\n').split('\t')

                    # `fields` is the module-level list of column names.
                    if len(parts) != len(fields):
                        logging.info("bad line %d", idx)
                        continue

                    # keep only the non-empty columns of the record
                    endict = {}
                    for field, v in zip(fields, parts):
                        if len(v) != 0:
                            endict[field] = v
                    self.kb[endict['entityid']] = endict
                    name = endict['name']

                    # The stored list is a copy, so it must be read, extended
                    # and written back.
                    try:
                        lst = self.name2ent[name]
                    except KeyError:
                        lst = []
                    lst.append(endict)
                    self.name2ent[name] = lst
        except KeyboardInterrupt:
            logging.info("ending prematurely.")

        # Reached both after a complete read and after an interrupt; any
        # other exception propagates without committing.
        self._persist()
        # Reopen the same tables read-only. (The interrupt path used to pass
        # the misspelled keyword `jflag` and the table name 'geonames'.)
        self._open_readonly(e2e_path, n2e_path)

    def __getitem__(self, item):
        """Return the kb record stored under entity id `item`."""
        return self.kb[item]

    def keys(self):
        """Return the entity ids of the kb, as provided by the underlying map."""
        return self.kb.keys()

コード例 #39

0

ファイルを表示

ファイル: buckets.py プロジェクト: ownport/objectstore

 def object_list(self):
     ''' returns a list of objects

     The keys of the 'objects' table are read from the metadata file and
     copied into a real list, so the result stays usable after the database
     handle has been closed.
     '''
     objects_metadata = SqliteDict(self._meta.filename, 'objects')
     try:
         # keys() may be lazy: materialize it before the handle is closed
         return list(objects_metadata.keys())
     finally:
         objects_metadata.close()