def row_generator(): row_list = [ Row(("movie_id", "title"), ("m0004", "Madagascar 2")), Row(("movie_id", "title"), ("m0005", "Madagascar 2")), ] for row in row_list: yield row
def clone_from_data_stream(self, documents): """用于从0开始, 从批量文档中生成数据库, 性能较高 1. 往主表格中填充一条文档 2. 从文档中生成倒排索引的索引字典 3. 往所有的索引表中填充索引 invert_index = {keyword: set_of_uuid} """ import time st = time.clock() print("正在往数据库 %s 中填充数据..." % self.schema.schema_name) # 初始化all_inv_dict all_inv_dict = dict() for keyword_field in self.schema.keyword_fields: # initialize a empty dict for invert index all_inv_dict[keyword_field] = dict() ins = self.main_table.insert() counter = 0 for document in documents: counter += 1 # 将字典document转化为row columns, values = list(), list() for k, v in document.items(): columns.append(k) values.append(v) row = Row(columns, values) # 更新主表的数据 try: self.engine.insert_row(ins, row) except: pass # 计算倒排索引 for keyword_field in self.schema.keyword_fields: uuid = document[self.schema.uuid] for keyword in document[keyword_field]: if keyword in all_inv_dict[keyword_field]: all_inv_dict[keyword_field][keyword].add(uuid) else: all_inv_dict[keyword_field][keyword] = set([ uuid, ]) # 将all_inv_dict中的数据存入索引表中 for keyword_field in all_inv_dict: table = self.get_table(keyword_field) ins = table.insert() for keyword, uuid_set in all_inv_dict[keyword_field].items(): self.engine.insert_record(ins, (keyword, uuid_set)) print("\t数据库准备完毕, 一共插入了 %s 条数据, 可以进行搜索了! 一共耗时 %s 秒" % ( counter, (time.clock() - st), ))
def add_one(self, document): """用于往数据库中添加数据, 以增量更新的模式更新索引, 性能较低 1. 往主表格中填充一条文档 2. 更新倒排索引里的表中的索引数据 """ # 将字典document转化为row columns, values = list(), list() for k, v in document.items(): columns.append(k) values.append(v) row = Row(columns, values) # 更新主表的数据 ins = self.main_table.insert() self.engine.insert_row(ins, row) # 对每一个field所对应的表进行更新 for keyword_field in self.schema.keyword_fields: table = self.get_table(keyword_field) ins = table.insert() upd = table.update() # 对表中所涉及的keyword的行进行更新 for keyword in document[keyword_field]: try: # 尝试插入新的, 若已经存在, 则进入更新 self.engine.insert_row( ins, Row(["keyword", "uuid_set"], [keyword, set([document[self.schema.uuid]])])) except: # 对keyword的行进行更新 a = self.engine.select( Select([table.uuid_set ]).where(table.keyword == keyword)) new_uuid_set = list(a)[0][0] print(new_uuid_set) new_uuid_set.add(document[self.schema.uuid]) self.engine.update( upd.values(uuid_set=new_uuid_set).where( table.keyword == keyword))
def Engine_insert_and_update_unittest(): ins = movie.insert() engine.insert_many_records(ins, records) new_records = [ ("m0001", "The Shawshank Redemption", 999, None, date(1994, 10, 14), {"Drama", "Crime"}), ("m0002", "The Godfather", 999, None, date(1972, 3, 24), {"Crime", "Drama"}), ] new_rows = [ Row(("movie_id", "title", "length", "release_date", "genres"), ("m0004", "12 Angry Men", 999, date(1957, 4, 11), {"Drama"})), Row(("movie_id", "title", "length", "release_date", "genres"), ("m0005", "Schindler's List", 999, date( 1994, 2, 4), {"Biography", "Drama", "History"})), ] engine.insert_and_update_many_records(ins, new_records) engine.insert_and_update_many_rows(ins, new_rows) engine.prt_all(movie)
def test1(): """测试逐条插入的方法 """ record = ("m0001", "Yes Man!", 95, 6.2, "2010-01-01", {"Drama", "Fantasy"}) row = Row( ("movie_id", "title", "length", "rate", "release_date", "genres"), ("m0002", "Yes Man!", 95, 6.2, "2010-01-01", {"Drama", "Fantasy"})) engine.insert_record(ins, record) engine.insert_row(ins, row) engine.prt_all(movie)
def Row_unittest(): """测试Row的属性和方法 """ row = Row(("movie_id", "title"), ("m0002", "Madagascar 2")) tplt._straightline("__str__ and __repr__ method") print(row) print(repr(row)) tplt._straightline("access attribute and item") print(row.movie_id) print(row["title"]) # change row item row["title"] = "good" print(row)
def Insert_unittest(): record = ("m0001", "Yes Man!", 95, 6.3, "2010-01-01", {"Drama", "Fantasy"}) row = Row(("movie_id", "title", "genres"), ("m0002", "Madagascar 2", {"Anime", "Comedy"})) ins = movie.insert() tplt._straightline("insert record sqlcmd") ins.sqlcmd_from_record() print(ins.insert_sqlcmd) tplt._straightline("insert row sqlcmd") ins.sqlcmd_from_row(row) print(ins.insert_sqlcmd) tplt._straightline("record converter") print(ins.default_record_converter(record)) tplt._straightline("row converter") print(ins.default_row_converter(row))
Column("intlist_type", datatype.intlist, default=IntList([8, 9, 10])), Column("pickletype_type", datatype.pickletype, default={ 1: "a", 2: "b", 3: "c" }), ) print(pythontype.create_table_sql()) metadata.create_all(engine) ins = pythontype.insert() # record = (1, datetime.date(1999,1,1), datetime.datetime(2000,1,1,0,30,45), # [1,2,3], {1,2,3}, {1:"a", 2:"b", 3: "c"}, OrderedDict({1:"a", 2:"b", 3: "c"}), # StrSet(["a", "b", "c"]), IntSet([1, 2, 3]), StrList(["x", "y", "z"]), IntList([9, 8, 7]), # MyClass(1000)) # engine.insert_record(ins, record) row = Row(["uuid"], [1]) engine.insert_row(ins, row) for record in engine.execute("SELECT * FROM pythontype"): print(record) for record in engine.select(Select(pythontype.all)): print(record) engine.prt_all(pythontype)
conn = engine.connect c = engine.cursor metadata = MetaData() datatype = DataType() col_ID = Column("ID", datatype.integer, primary_key=True) col_name = Column("name", datatype.text) test = Table("test", metadata, col_ID, col_name) metadata.create_all(engine) ins = test.insert() records = [(i, "abcdefghijklmnopqrstuvwxyz") for i in range(1000)] records = records + [(random.randint(1, 1000), "abcdefghijklmnopqrstuvwxyz") for i in range(10)] rows = [ Row(("ID", "name"), (i, "abcdefghijklmnopqrstuvwxyz")) for i in range(1000) ] rows = rows + [ Row(("ID", "name"), (random.randint(1, 1000), "abcdefghijklmnopqrstuvwxyz")) for i in range(10) ] timer = Timer() def insert_test1(): # 4.0 - 5.0 second """test insert record one by one """ timer.start() for record in records: