Example #1
    def bulk_insert_test():
        """
        当待插入的数据中只有少量数据会损害数据完整性时, 如果我们一条条insert and commit会太慢, 而我们又
        无法一次性insertmany and commit。此时我们如果能将所有数据拆分成大小为 平方根(数据总条数) 的小包,
        其中就会有许多小包都能insertmany成功, 而只有部分小包必须一条条的insert and commit。这样总体速度
        就会得到极大的提高。
        
        为了测试该算法, 我们设定primary key为1 - 10,000。我们预先向数据库内随机插入100条数据。
        然后我们尝试将10000条primary key分别为1 - 10,000的数据插入数据库:
        
        注: sqlite引擎不需要在出错后进行rollback, 所以没有必要在每一次try语句失败后进行rollback, 所以
        该算法的优势体现不出来。为了达到展示算法的目的, 我们强行对每一次insert/insertmany进行commit。
        """
        
        timer = Timer()
        try:
            os.remove("test.db")
        except OSError:
            pass
        
        connect = sqlite3.connect("test.db")
        cursor = connect.cursor()
        cursor.execute("CREATE TABLE test (uuid INTEGER, name TEXT, PRIMARY KEY (uuid) );")
        
        ### Pre-populate the database with some rows
        complexity = 100
        records = [(random.randint(1, complexity**2), "abcdefghijklmnopqrstuvwxyz" ) for i in range(complexity)]
        for record in records:
            try:
                cursor.execute("INSERT INTO test VALUES (?,?);", record)
            except sqlite3.IntegrityError:  # duplicate primary key, skip it
                pass
        connect.commit()
    
        records = [(i, "abcdefghijklmnopqrstuvwxyz") for i in range(1, complexity**2 + 1)]
        
        def insert1(): # 90.7 seconds / 0.9448 seconds at complexity 100 / 10
            """Insert and commit one row at a time.
            """
            for record in records:
                try:
                    cursor.execute("INSERT INTO test VALUES (?,?);", record)
                    connect.commit()
                except sqlite3.IntegrityError:  # primary key already exists, skip
                    pass
        
#         timer.test(insert1, 1)
        
        def insert2(): # 31.4067297608 seconds / 0.38 seconds at complexity 100 / 10
            """Split the big batch into chunks of size = sqrt(len(data)) and try to insert each
            chunk with executemany; if a chunk fails, fall back to inserting it row by row.
            """
            try:
                cursor.executemany("INSERT INTO test VALUES (?,?);", records)
                connect.commit()
            except sqlite3.IntegrityError:
                for chunk in grouper_list(records, int(sqrt(len(records)))):
                    try:
                        cursor.executemany("INSERT INTO test VALUES (?,?);", chunk)
                        connect.commit()
                    except sqlite3.IntegrityError:
                        for record in chunk:
                            try:
                                cursor.execute("INSERT INTO test VALUES (?,?);", record)
                                connect.commit()
                            except sqlite3.IntegrityError:
                                pass
        
#         timer.test(insert2, 1)
        
        def insert3(): # 8.67 seconds
            """Split the data twice in a row (chunks, then sub-chunks) to push the speed further.
            """
            try:
                cursor.executemany("INSERT INTO test VALUES (?,?);", records)
                connect.commit()
            except sqlite3.IntegrityError:
                for chunk in grouper_list(records, int(sqrt(len(records)))):
                    try:
                        cursor.executemany("INSERT INTO test VALUES (?,?);", chunk)
                        connect.commit()
                    except sqlite3.IntegrityError:
                        for smaller_chunk in grouper_list(chunk, int(sqrt(len(chunk)))):
                            try:
                                cursor.executemany("INSERT INTO test VALUES (?,?);", smaller_chunk)
                                connect.commit()
                            except sqlite3.IntegrityError:
                                for record in smaller_chunk:
                                    try:
                                        cursor.execute("INSERT INTO test VALUES (?,?);", record)
                                        connect.commit()
                                    except sqlite3.IntegrityError:
                                        pass
        timer.test(insert3, 1)
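
The snippet above relies on two helpers whose definitions are not shown: sqrt (presumably from the math module) and grouper_list, which splits a list into fixed-size chunks. Below is a minimal sketch of such a chunking helper; its name and exact behavior are inferred from how the snippet calls it, not the author's original implementation:

from math import sqrt

def grouper_list(items, size):
    """Yield consecutive slices of at most size elements from a list."""
    for i in range(0, len(items), size):
        yield items[i:i + size]

# 10,000 records split into sqrt-sized chunks: 100 chunks of 100 rows each.
records = [(i, "abcdefghijklmnopqrstuvwxyz") for i in range(1, 10001)]
chunks = list(grouper_list(records, int(sqrt(len(records)))))
assert len(chunks) == 100 and len(chunks[0]) == 100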
Example #2
##encoding=utf-8

from angora.GADGET.pytimer import Timer
from angora.DATA.timewrapper import timewrapper
from angora.STRING.formatmaster import fmter
from datetime import datetime, date, timedelta
import pandas as pd
import unittest
import sqlite3
import random
import os

timer = Timer()


class PandasReadBigFile(unittest.TestCase):
    """
    验证pandas读取大文件时, 如果只读取前两行, 是否需要的时间极短
    """

    #     def setUp(self):
    #         df = list()
    #         complexity = 10000
    #         for _ in range(complexity):
    #             df.append( (
    #                         fmter.tpl.randstr(32),
    #                         random.randint(1, 1000),
    #                         random.random(),
    #                         timewrapper.randdate("1980-01-01", "2015-04-28").\
    #                         strftime("%b %m, %Y"),
    #                         timewrapper.randdatetime("1980-01-01 00:00:00", "2015-04-28 23:59:59").\
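
The commented-out setUp above is cut off in this snippet. The following is a hedged sketch of the behavior the test class describes: timing how long pandas needs to read only the first rows of a large file. It assumes a large CSV named test.csv already exists; pandas' read_csv accepts an nrows argument for exactly this purpose:

import pandas as pd
from timeit import default_timer

start = default_timer()
df_head = pd.read_csv("test.csv", nrows=2)  # parse only the first two data rows
elapsed = default_timer() - start
print(df_head)
print("read 2 rows in %.6f seconds" % elapsed)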
Example #3
##encoding=utf-8
"""
Mongodb中如何:
    1. 删除document
    2. 删除整个collection
    3. remove和drop的区别
    
    collecion.remove({})很像find(), 会先query找到匹配的内容, 然后一条条删除之
    而collection.drop()则是删除整个collection.
    如过collection有一些metadata, 例如index, 那么remove({})掉所有的document并不会删除index.
    而drop()则会删除掉这些metadata
"""

from tt00_connect import client, db, users
from angora.GADGET.pytimer import Timer
from angora.STRING.formatmaster import fmter
from pprint import pprint as ppt
from datetime import datetime, date

timer = Timer()

timer.start()
users.remove({})
# users.drop()
timer.timeup()

for doc in users.find():
    print(doc)
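
Below is a hedged sketch of the remove-vs-drop difference described in the docstring, written against pymongo. The connection details and the index are illustrative assumptions, not part of the original snippet; also note that collection.remove() is deprecated in pymongo 3.x and removed in 4.x, where delete_many({}) is the way to delete every document:

from pymongo import ASCENDING, MongoClient

client = MongoClient("localhost", 27017)
users = client["test"]["users"]

users.insert_many([{"_id": i, "name": "user_%d" % i} for i in range(3)])
users.create_index([("name", ASCENDING)])

users.delete_many({})                # the documents are gone ...
print(users.index_information())     # ... but the "name" index metadata is still there

users.drop()                         # removes the documents AND metadata such as indexes
print(client["test"].list_collection_names())  # "users" is no longer listed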