from typing import Callable, Iterable

from unqlite import UnQLite


def index_by(index_dir: str, index_extension: str, data_iter: Iterable,
             key_fn: Callable, value_fn: Callable, checkpoint: int,
             object_name: str):
    """
    Generate UnQLite data indices for each entity.
    :param index_dir: index directory
    :param index_extension: index file extension
    :param data_iter: iterable over the data
    :param key_fn: function applied to each datum to get the index key
    :param value_fn: function applied to each datum to get the index value
    :param checkpoint: commit the indices every `checkpoint` entries
    :param object_name: object name used in progress log messages
    :return: dict of index paths by entity name
    """
    i = 0
    index_path_by_entity = {}
    index_by_entity = {}
    for data in data_iter:
        entity = data['@type']
        if entity not in index_path_by_entity:
            # get_file_path is a helper from the surrounding project that
            # builds the index file path from the directory, entity name
            # and extension.
            index_path = get_file_path([index_dir, entity],
                                       ext=index_extension)
            index_path_by_entity[entity] = index_path

            # Open an index for this entity and start a transaction
            index = UnQLite(index_path)
            index.begin()
            index_by_entity[entity] = index
        index = index_by_entity[entity]

        # Store this entry in its entity's index
        index[str(key_fn(data))] = value_fn(data)

        i += 1
        # Log progress every 50,000 entries
        if i % 50000 == 0:
            print(f'checkpoint: {i} {object_name}')
        # Commit every `checkpoint` entries
        if i % checkpoint == 0:
            # Flush the indices and start fresh transactions
            for index in index_by_entity.values():
                index.commit()
                index.begin()
    print(f'checkpoint: {i} {object_name}')

    # Close indices
    for index in index_by_entity.values():
        index.commit()
        index.close()

    # Return the index paths by entity name
    return index_path_by_entity
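
A minimal usage sketch (the records, directory, key/value functions and
checkpoint below are hypothetical; the exact index file names depend on the
project's get_file_path helper):

records = [
    {'@type': 'Person', 'id': 1, 'name': 'Ada'},
    {'@type': 'Person', 'id': 2, 'name': 'Grace'},
    {'@type': 'Place', 'id': 3, 'name': 'Paris'},
]
index_paths = index_by(index_dir='/tmp/indices',
                       index_extension='unqlite',
                       data_iter=iter(records),
                       key_fn=lambda d: d['id'],
                       value_fn=lambda d: d['name'],
                       checkpoint=10000,
                       object_name='record')
print(index_paths)  # e.g. {'Person': '/tmp/indices/Person.unqlite', ...}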
Example #2
import os

from unqlite import UnQLite


# BaseTestCase is assumed to be a unittest.TestCase subclass defined in the
# surrounding test module.
class TestTransaction(BaseTestCase):
    """
    We must use a file-based database to test the transaction functions. See
    http://unqlite.org/forum/trouble-with-transactions+1 for details.
    """

    def setUp(self):
        self._filename = "test.db"
        self.db = UnQLite(self._filename)

    def tearDown(self):
        try:
            self.db.close()
        except Exception:
            # The database may already have been closed by the test.
            pass
        if os.path.exists(self._filename):
            os.unlink(self._filename)

    def test_transaction(self):
        @self.db.commit_on_success
        def _test_success(key, value):
            self.db[key] = value

        @self.db.commit_on_success
        def _test_failure(key, value):
            self.db[key] = value
            raise Exception("intentional exception raised")

        _test_success("k1", "v1")
        self.assertEqual(self.db["k1"], "v1")

        self.assertRaises(Exception, lambda: _test_failure("k2", "v2"))
        self.assertRaises(KeyError, lambda: self.db["k2"])

    def test_explicit_transaction(self):
        self.db.close()
        self.db.open()
        self.db.begin()
        self.db["k1"] = "v1"
        self.db.rollback()

        self.assertRaises(KeyError, lambda: self.db["k1"])
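
For reference, unqlite-python exposes the same commit/rollback behavior
through the transaction() context manager. A minimal sketch (the file name
is hypothetical; as above, a file-based database is required):

from unqlite import UnQLite

db = UnQLite('example.db')

# Commits on success, rolls back if the block raises.
with db.transaction():
    db['k1'] = 'v1'

try:
    with db.transaction():
        db['k2'] = 'v2'
        raise Exception('intentional exception raised')
except Exception:
    pass

print(db['k1'])    # 'v1'
print('k2' in db)  # False: the write was rolled back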
Example #3

            # Abort if any worker thread has died.
            if not all(t.is_alive() for t in threads):
                raise Exception("Threads are dead.")
        print("Finished.")
    except KeyboardInterrupt:
        stop_evt.set()
        print("Stopped.")
        stop_flag = True
    finally:
        # Commit so the progress made so far is persisted.
        db.commit()
        print("Progress Saved.")
    return not stop_flag
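
The tail above follows a common pattern: worker threads produce results, the
main thread writes them to UnQLite, and the database is committed in a
finally block so progress survives a Ctrl-C. A minimal self-contained sketch
of that pattern (the worker, queue, and file name are illustrative, not the
original retrieve_tweets implementation):

import queue
import threading
import time

from unqlite import UnQLite


def run(num_threads=4, db_path='progress.db'):
    db = UnQLite(db_path)
    stop_evt = threading.Event()
    results = queue.Queue()
    stop_flag = False

    def worker(n, items=50):
        # Produce a bounded number of results, stopping early if asked.
        for i in range(items):
            if stop_evt.is_set():
                return
            results.put(('worker-%d-%d' % (n, i), str(i)))
            time.sleep(0.05)

    threads = [threading.Thread(target=worker, args=(n,))
               for n in range(num_threads)]
    db.begin()
    try:
        for t in threads:
            t.start()
        # Drain results until every worker has finished.
        while any(t.is_alive() for t in threads) or not results.empty():
            try:
                key, value = results.get(timeout=1)
            except queue.Empty:
                continue
            db[key] = value
        print("Finished.")
    except KeyboardInterrupt:
        stop_evt.set()
        print("Stopped.")
        stop_flag = True
    finally:
        for t in threads:
            t.join()
        db.commit()  # persist whatever was written before the interrupt
        print("Progress Saved.")
    return not stop_flag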


if __name__ == "__main__":
    import random
    db.begin()
    # Build (consumer_key, consumer_secret, access_token, access_secret)
    # credential tuples for each account in the token configuration file.
    tokens = [
        [tk[k] for k in ('consumer_key', 'consumer_secret',
                         'access_token', 'access_secret')]
        for tk in json.load(open('config/tokens.json'))
    ]
    for f in glob('llt/twitter-events-2012-2016/*.ids'):
        print(f)
        if not retrieve_tweets(
                f,
                'llt/Data2/%s' % os.path.basename(f),
                THREAD_NUM,
                tokens,
                # proxies = [None],
                # proxies = ['127.0.0.1:49999'],
                proxies=['127.0.0.1:12305'],