def import_csv(csv_path, batch=False, chunksize=500, skiprows=None):
    start_time = dt.datetime.today().timestamp()
    # NOTE: error_bad_lines/warn_bad_lines were replaced by on_bad_lines in pandas >= 1.3
    for df in pd.read_csv(csv_path, delimiter='\t', encoding='utf-8',
                          dtype=dtype, converters=converters,
                          chunksize=chunksize, skiprows=skiprows,
                          error_bad_lines=False, warn_bad_lines=True):
        print("transform")
        # Replace NaN with None so empty CSV cells are not inserted
        df = df.where(pd.notnull(df), None)
        print("importing rows {} to {}".format(df.index.min(), df.index.max()))
        if batch:
            b = BatchQuery()
        for i, row in df.iterrows():
            try:
                print(u"DOING {} ; {}".format(i, row['code'].encode('utf-8')))
                if row['code'] is None or len(row['code'].strip()) == 0:
                    print("error with line {0} : code = '{1}'".format(
                        i, row['code'].encode('utf-8')))
                    continue
                row_converted = {
                    Product._get_column_by_db_name(cql_name).column_name: value
                    for cql_name, value in row.items()
                    if not cql_name.endswith('_datetime')
                    and value is not None
                    and (type(value) != str or len(value) > 0)
                }
                # Model.create() persists immediately, so route the insert
                # through the batch when batching
                if batch:
                    Product.batch(b).create(**row_converted)
                else:
                    Product.create(**row_converted)
                print(u"DONE {} ; {}".format(i, row['code'].encode('utf-8')))
            except Exception:
                print(u"EXCEPTION {} ; {}".format(i, row['code'].encode('utf-8')))
                exc_type, exc_value, exc_traceback = sys.exc_info()
                traceback.print_exception(exc_type, exc_value, exc_traceback,
                                          limit=10, file=sys.stderr)
        if batch:
            print("executing batch for rows {} to {}".format(
                df.index.min(), df.index.max()))
            b.execute()
        time_diff = dt.datetime.today().timestamp() - start_time
        print("TIMING {} rows/s".format(chunksize / time_diff))
        start_time = dt.datetime.today().timestamp()

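# A minimal sketch of the same chunked-insert pattern using BatchQuery as a
# context manager: the batch executes automatically on a clean exit (and is
# skipped if an exception propagates, unless execute_on_exception=True).
# The Item model and its columns are hypothetical, for illustration only.
from uuid import uuid4

from cassandra.cqlengine import columns
from cassandra.cqlengine.models import Model
from cassandra.cqlengine.query import BatchQuery


class Item(Model):
    id = columns.UUID(primary_key=True, default=uuid4)
    code = columns.Text()


def import_items(rows, chunksize=500):
    # One small batch per chunk keeps batch statements from growing unbounded
    for start in range(0, len(rows), chunksize):
        with BatchQuery() as b:
            for row in rows[start:start + chunksize]:
                Item.batch(b).create(**row)
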
def populate_customer_invoice_table():
    sync_table(CustomerInvoice)
    customer_invoice = pd.read_csv(CUSTOMER_INVOICE_TABLE_FILE_NAME)

    # Insert data into the Cassandra table in 100-row batches
    batch_size = 100
    batch_current_file_count = 0
    batch_manager = BatchQuery()
    for index, row in tqdm(customer_invoice.iterrows(), total=customer_invoice.shape[0]):
        CustomerInvoice.batch(batch_manager) \
            .create(customer_id=int(row['customer_id']),
                    invoice_date=datetime.strptime(row['invoice_date'], "%Y-%m-%d %H:%M:%S"),
                    product_code=str(row['product_code']),
                    invoice_id=row['invoice_id'],
                    customer_email=row['customer_email'],
                    customer_phone_number=str(row['customer_phone_number']),
                    customer_country=row['customer_country'],
                    customer_postcode=str(row['customer_postcode']),
                    customer_house_number=str(row['customer_house_number']),
                    customer_has_loyalty_card=row['customer_has_loyalty_card'],
                    product_description=row['product_description'],
                    product_unit_price=row['product_unit_price'],
                    product_quantity=row['product_quantity'],
                    invoice_total=row['invoice_total'])
        batch_current_file_count += 1
        if batch_current_file_count == batch_size:
            batch_manager.execute()
            batch_current_file_count = 0
    # Flush any remaining rows that did not fill a complete batch
    batch_manager.execute()

def test_batch_insert_if_not_exists_success(self):
    """ tests that batch insertion with if_not_exists works as expected """
    id = uuid4()

    with BatchQuery() as b:
        TestIfNotExistsModel.batch(b).if_not_exists().create(id=id, count=8, text='123456789')

    b = BatchQuery()
    TestIfNotExistsModel.batch(b).if_not_exists().create(id=id, count=9, text='111111111111')
    with self.assertRaises(LWTException) as assertion:
        b.execute()

    self.assertEqual(assertion.exception.existing, {
        'count': 8,
        'id': id,
        'text': '123456789',
        '[applied]': False,
    })

    q = TestIfNotExistsModel.objects(id=id)
    self.assertEqual(len(q), 1)

    tm = q.first()
    self.assertEqual(tm.count, 8)
    self.assertEqual(tm.text, '123456789')

def test_batch_update_conditional_several_rows(self):
    sync_table(TestUpdateModel)
    self.addCleanup(drop_table, TestUpdateModel)

    first_row = TestUpdateModel.create(partition=1, cluster=1, value=5, text="something")
    second_row = TestUpdateModel.create(partition=1, cluster=2, value=5, text="something")

    b = BatchQuery()
    TestUpdateModel.batch(b).if_not_exists().create(partition=1, cluster=1, value=5, text='something else')
    TestUpdateModel.batch(b).if_not_exists().create(partition=1, cluster=2, value=5, text='something else')
    TestUpdateModel.batch(b).if_not_exists().create(partition=1, cluster=3, value=5, text='something else')

    # The response will contain more than two rows because two of the inserts will fail
    with self.assertRaises(LWTException):
        b.execute()

    first_row.delete()
    second_row.delete()
    b.execute()

def parse_csv(filepath, username, trip):
    """ Parse a csv and import it into the database """
    with open(filepath, 'r') as csvfile:
        reader = csv.DictReader(csvfile.read().split('\n'))
        b = BatchQuery()
        last_dt = None
        for i, line in enumerate(reader):
            # Flush every 1000 rows so individual batches stay small
            if i % 1000 == 0:
                b.execute()
                b = BatchQuery()
            try:
                dt = parser.parse(line['time'])
                if dt == last_dt:
                    # Skip consecutive points with identical timestamps
                    continue
                pt = {'lon': line['lon'],
                      'lat': line['lat'],
                      # key spelling must match the Point model's column name
                      'accurracy': line['accuracy'],
                      'username': username,
                      'created_at': dt,
                      'trip_id': trip}
                last_dt = dt
                Point.batch(b).create(**pt)
            except ValueError:
                continue
        b.execute()
    return username, trip

def test_batch_insert_if_not_exists(self):
    """ tests that batch insertion with if_not_exists works as expected """
    id = uuid4()

    with BatchQuery() as b:
        TestIfNotExistsModel.batch(b).if_not_exists().create(
            id=id, count=8, text='123456789')

    b = BatchQuery()
    TestIfNotExistsModel.batch(b).if_not_exists().create(
        id=id, count=9, text='111111111111')
    with self.assertRaises(LWTException) as assertion:
        b.execute()

    self.assertEqual(assertion.exception.existing, {
        'count': 8,
        'id': id,
        'text': '123456789',
        '[applied]': False,
    })

    q = TestIfNotExistsModel.objects(id=id)
    self.assertEqual(len(q), 1)

    tm = q.first()
    self.assertEqual(tm.count, 8)
    self.assertEqual(tm.text, '123456789')

def import_data():
    connection.setup(['127.0.0.1'], "geonames", protocol_version=3)
    fieldnames = [col for col in Geoname().__dict__['_values']]
    with open("geonames/allCountries.txt", encoding='utf8', newline='') as csvfile:
        # Build the DictReader
        csv.register_dialect('geoname', delimiter='\t', quoting=csv.QUOTE_NONE)
        reader = csv.DictReader(csvfile, dialect='geoname', fieldnames=fieldnames)
        # Batch ingestion
        count = 0
        batch = BatchQuery()
        for row in reader:
            new_row = clean_row(row)
            Geoname.batch(batch).create(**new_row)
            count += 1
            if not count % 1000:
                batch.execute()
                batch = BatchQuery()
                logger.info('Imported: {}'.format(count))
        batch.execute()

def store_cancer_data(self, cancer_data):
    data_group_info = cancer_data.data_group_info
    country = data_group_info.country
    age = data_group_info.age_gender.age
    gender = data_group_info.age_gender.gender
    bq = BatchQuery(consistency=ConsistencyLevel.ONE)
    for cancer_data_record in cancer_data.cancer_data:
        cancer = cancer_data_record.cancer
        deaths = cancer_data_record.deaths
        crude_rate = cancer_data_record.crude_rate
        # was cancer_data_record.crude_rate, evidently a copy-paste slip;
        # assumes the record exposes an `asr` field like its other attributes
        asr = cancer_data_record.asr
        cumulative_risk = cancer_data_record.cumulative_risk
        CancerDataEntity.batch(bq).create(
            age=age,
            gender=gender,
            country=country,
            cancer=cancer,
            asr=asr,
            crude_rate=crude_rate,
            cumulative_risk=cumulative_risk,
            deaths=deaths,
        )
    bq.execute()

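# Hedged variant of the batch construction above: for bulk loads that do not
# need atomicity across rows, an unlogged batch skips the coordinator's batch
# log, and batching rows that share a partition key keeps coordinator load
# down. BatchType comes from cassandra.cqlengine.query.
from cassandra import ConsistencyLevel
from cassandra.cqlengine.query import BatchQuery, BatchType

bq = BatchQuery(batch_type=BatchType.Unlogged,
                consistency=ConsistencyLevel.ONE)
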
def test_insert_success_case(self):
    b = BatchQuery()
    inst = TestMultiKeyModel.batch(b).create(partition=self.pkey, cluster=2, count=3, text='4')

    with self.assertRaises(TestMultiKeyModel.DoesNotExist):
        TestMultiKeyModel.get(partition=self.pkey, cluster=2)

    b.execute()

    TestMultiKeyModel.get(partition=self.pkey, cluster=2)

def save_ts(self, ts_list: List[TSData]):
    b = BatchQuery()
    for ts_data in ts_list:
        func = TSFunctionRegistry.find_function(ts_data.ts_type_name)
        value_serialized: str = func.serialize(ts_data.values)
        TimeSeriesDataModel.batch(b).create(type=ts_data.ts_type_name,
                                            code=ts_data.code,
                                            visible_time=ts_data.visible_time,
                                            data=value_serialized)
    b.execute()

def truncate_models(self):
    log.info('Purging data from all tables...')
    for model in self.get_cassandra_models():
        b = BatchQuery()
        for instance in model.objects.all():
            instance.batch(b).delete()
        b.execute()
    log.info('Done!')

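# Deleting row by row through a batch, as above, costs one queued statement per
# row; when the goal is simply to empty each table, a raw TRUNCATE is usually
# far cheaper. A sketch, assuming the models' keyspace is already configured.
from cassandra.cqlengine import connection

def truncate_models_fast(models):
    for model in models:
        # column_family_name() returns the fully qualified "<keyspace>.<table>"
        connection.execute('TRUNCATE {}'.format(model.column_family_name()))
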
def save(self, ts_list: List[TSData]):
    b = BatchQuery()
    for ts_data in ts_list:
        func = TSTypeRegistry.find_function(ts_data.ts_type_name)
        if not isinstance(func, HistoryTimeSeriesType):
            raise RuntimeError("invalid tsType")
        value_serialized: str = func.serialize(ts_data.values)
        TimeSeriesDataModel.batch(b).create(type=ts_data.ts_type_name,
                                            code=ts_data.code,
                                            visible_time=ts_data.visible_time,
                                            data=value_serialized)
    b.execute()

def test_async_batch(self):
    b = BatchQuery(execute_async=True)
    TestMultiKeyModel.batch(b).create(partition=self.pkey, cluster=2, count=3, text='4')

    with self.assertRaises(TestMultiKeyModel.DoesNotExist):
        TestMultiKeyModel.get(partition=self.pkey, cluster=2)

    b.execute()

    TestMultiKeyModel.get(partition=self.pkey, cluster=2)

def process_create_entry_work(self, rdict, context, do_load):
    b = BatchQuery()
    # MOSTLY the resource will not exist, so start by calculating the URL and
    # trying to insert the entire record.
    if not do_load:
        url = u"file://{}{}/{}".format(decode_str(context['local_ip']),
                                       decode_str(context['path']),
                                       decode_str(context['entry']))
    else:
        with open(context['fullpath'], 'r') as f:
            blob = Blob.create_from_file(f, rdict['size'])
            if blob:
                url = "cassandra://{}".format(blob.id)
            else:
                return None

    try:
        # OK -- try to insert (create) the record...
        t1 = time.time()
        resource = Resource.batch(b).create(url=url, **rdict)
        msg = u'Resource {} created --> {}'.format(resource.name, time.time() - t1)
        logger.info(msg)
    except ResourceConflictError:
        # If the create fails, the record already exists... so retrieve it...
        t1 = time.time()
        resource = Resource.objects().get(container=context['collection'], name=rdict['name'])
        msg = u"{} ::: Fetch Object -> {}".format(resource.name, time.time() - t1)
        logger.info(msg)

        # If the URL is not correct, then update.
        # TODO: If the URL is a block set that's stored internally, reduce its
        # count so that it can be garbage collected.
        # t3 = None
        if resource.url != url:
            t2 = time.time()
            # if url.startswith('cassandra://'): tidy up the stored block count...
            resource.batch(b).update(url=url)
            t3 = time.time()
            msg = u"{} ::: update -> {}".format(resource.name, t3 - t2)
            logger.info(msg)

    # t1 = time.time()
    SearchIndex.reset(resource.id)
    SearchIndex.index(resource, ['name', 'metadata'])
    # msg = "Index Management -> {}".format(time.time() - t1)
    # logger.info(msg)

    b.execute()

def test_callbacks_properly_execute_callables_and_tuples(self):
    call_history = []

    def my_callback(*args, **kwargs):
        call_history.append(args)

    # adding on init:
    batch = BatchQuery()
    batch.add_callback(my_callback)
    batch.add_callback(my_callback, 'more', 'args')

    batch.execute()

    assert len(call_history) == 2
    assert [(), ('more', 'args')] == call_history

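# The callback hook exercised above fires only after BatchQuery.execute()
# succeeds, which makes it a natural place for post-write work such as cache
# invalidation. A sketch; invalidate_cache is a hypothetical helper.
from cassandra.cqlengine.query import BatchQuery

def invalidate_cache(*keys):
    """Hypothetical: evict the given keys from an application-level cache."""

b = BatchQuery()
b.add_callback(invalidate_cache, 'user:42', 'user:43')
# ... queue model writes against b here ...
b.execute()  # callbacks run here, after the batch statement succeeds
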
def test_update_success_case(self):
    inst = TestMultiKeyModel.create(partition=self.pkey, cluster=2, count=3, text='4')

    b = BatchQuery()
    inst.count = 4
    inst.batch(b).save()

    inst2 = TestMultiKeyModel.get(partition=self.pkey, cluster=2)
    assert inst2.count == 3

    b.execute()

    inst3 = TestMultiKeyModel.get(partition=self.pkey, cluster=2)
    assert inst3.count == 4

def test_update_success_case(self):
    inst = TestMultiKeyModel.create(partition=self.pkey, cluster=2, count=3, text='4')

    b = BatchQuery()
    inst.count = 4
    inst.batch(b).save()

    inst2 = TestMultiKeyModel.get(partition=self.pkey, cluster=2)
    self.assertEqual(inst2.count, 3)

    b.execute()

    inst3 = TestMultiKeyModel.get(partition=self.pkey, cluster=2)
    self.assertEqual(inst3.count, 4)

def test_callbacks_properly_execute_callables_and_tuples(self):
    call_history = []

    def my_callback(*args, **kwargs):
        call_history.append(args)

    # adding on init:
    batch = BatchQuery()
    batch.add_callback(my_callback)
    batch.add_callback(my_callback, 'more', 'args')

    batch.execute()

    self.assertEqual(len(call_history), 2)
    self.assertEqual([(), ('more', 'args')], call_history)

def populate_daily_revenue_table():
    sync_table(DailyRevenue)
    daily_revenue = pd.read_csv(DAILY_REVENUE_TABLE_FILE_NAME)

    batch_size = 200
    batch_current_file_count = 0
    batch_manager = BatchQuery()
    for index, row in tqdm(daily_revenue.iterrows(), total=daily_revenue.shape[0]):
        DailyRevenue.batch(batch_manager) \
            .create(invoice_date=datetime.strptime(row['invoice_date'], "%Y-%m-%d"),
                    daily_revenue=row['daily_revenue'])
        batch_current_file_count += 1
        if batch_current_file_count == batch_size:
            batch_manager.execute()
            batch_current_file_count = 0
    batch_manager.execute()

def append(self, sequenced_item_or_items):
    if isinstance(sequenced_item_or_items, list):
        if len(sequenced_item_or_items):
            b = BatchQuery()
            for item in sequenced_item_or_items:
                assert isinstance(item, self.sequenced_item_class), (
                    type(item), self.sequenced_item_class)
                kwargs = self.get_field_kwargs(item)
                self.active_record_class.batch(b).if_not_exists().create(**kwargs)
            try:
                b.execute()
            except LWTException as e:
                self.raise_sequenced_item_error(sequenced_item_or_items, e)
    else:
        active_record = self.to_active_record(sequenced_item_or_items)
        try:
            active_record.save()
        except LWTException as e:
            self.raise_sequenced_item_error(sequenced_item_or_items, e)

def populate_product_sale_counts_table():
    sync_table(ProductSaleCounts)
    product_sale_counts = pd.read_csv(PRODUCT_SALE_COUNTS_FILE_NAME)

    batch_size = 200
    batch_current_file_count = 0
    batch_manager = BatchQuery()
    for index, row in tqdm(product_sale_counts.iterrows(), total=product_sale_counts.shape[0]):
        ProductSaleCounts.batch(batch_manager) \
            .create(product_code=str(row['product_code']),
                    product_total_quantity_sold=row['product_total_quantity_sold'],
                    product_description=row['product_description'])
        batch_current_file_count += 1
        if batch_current_file_count == batch_size:
            batch_manager.execute()
            batch_current_file_count = 0
    batch_manager.execute()

def populate_product_revenue_table():
    sync_table(ProductRevenue)
    product_revenue = pd.read_csv(PRODUCT_REVENUE_TABLE_FILE_NAME)

    batch_size = 200
    batch_current_file_count = 0
    batch_manager = BatchQuery()
    for index, row in tqdm(product_revenue.iterrows(), total=product_revenue.shape[0]):
        ProductRevenue.batch(batch_manager) \
            .create(product_code=str(row['product_code']),
                    product_total_revenue=row['product_total_revenue'],
                    product_description=row['product_description'])
        batch_current_file_count += 1
        if batch_current_file_count == batch_size:
            batch_manager.execute()
            batch_current_file_count = 0
    batch_manager.execute()

def batch_save(models, ttl=None):
    ''' Executes the batch query '''
    if len(models) <= MAX_BATCH_SIZE:
        b = BatchQuery()
        for m in models:
            if ttl is not None:
                m.batch(b).ttl(ttl).save()
            else:
                m.batch(b).save()
        b.execute()
    else:
        start = 0
        end = MAX_BATCH_SIZE
        while end <= len(models):
            # Pass ttl through so recursive slices keep the caller's TTL
            batch_save(models[start:end], ttl)
            start = end
            if end == len(models):
                break
            end = len(models) if (end + MAX_BATCH_SIZE) > len(models) else end + MAX_BATCH_SIZE

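# The recursive slicing above can also be written iteratively; a sketch under
# the same assumptions (MAX_BATCH_SIZE and the cqlengine models as above).
def batch_save_iterative(models, ttl=None):
    for start in range(0, len(models), MAX_BATCH_SIZE):
        b = BatchQuery()
        for m in models[start:start + MAX_BATCH_SIZE]:
            if ttl is not None:
                m.batch(b).ttl(ttl).save()
            else:
                m.batch(b).save()
        b.execute()
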
def test_batch_insert_if_not_exists_success(self):
    """ tests that batch insertion with if_not_exists works as expected """
    id = uuid4()

    with BatchQuery() as b:
        TestIfNotExistsModel.batch(b).if_not_exists().create(
            id=id, count=8, text='123456789')

    b = BatchQuery()
    TestIfNotExistsModel.batch(b).if_not_exists().create(
        id=id, count=9, text='111111111111')
    with self.assertRaises(LWTException):
        b.execute()

    q = TestIfNotExistsModel.objects(id=id)
    self.assertEqual(len(q), 1)

    tm = q.first()
    self.assertEqual(tm.count, 8)
    self.assertEqual(tm.text, '123456789')

def append(self, sequenced_item_or_items):
    if isinstance(sequenced_item_or_items, list):
        if len(sequenced_item_or_items):
            b = BatchQuery()
            for item in sequenced_item_or_items:
                assert isinstance(item, self.sequenced_item_class), (
                    type(item), self.sequenced_item_class)
                kwargs = self.get_field_kwargs(item)
                self.record_class.batch(b).if_not_exists().create(**kwargs)
            try:
                b.execute()
            except LWTException:
                self.raise_sequenced_item_conflict()
    else:
        record = self.to_record(sequenced_item_or_items)
        try:
            record.save()
        except LWTException:
            self.raise_sequenced_item_conflict()

def test_batch_update_conditional(self):
    t = TestConditionalModel.create(text='something', count=5)
    id = t.id
    with BatchQuery() as b:
        t.batch(b).iff(count=5).update(text='something else')

    updated = TestConditionalModel.objects(id=id).first()
    self.assertEqual(updated.text, 'something else')

    b = BatchQuery()
    updated.batch(b).iff(count=6).update(text='and another thing')
    with self.assertRaises(LWTException) as assertion:
        b.execute()

    self.assertEqual(assertion.exception.existing, {
        'id': id,
        'count': 5,
        '[applied]': False,
    })

    updated = TestConditionalModel.objects(id=id).first()
    self.assertEqual(updated.text, 'something else')

def test_empty_batch(self):
    b = BatchQuery()
    b.execute()

    with BatchQuery() as b:
        pass

                    cord_id=row.cord_id,
                    acronym=row.acronym,
                    kpi_name=row.kpi_name,
                    kpi_version=row.kpi_version,
                    value=row.value)
            else:
                PlmnKeyNotFound.batch(keys_not_found_queries).create(
                    kpi_basename=row.kpi_basename,
                    date=row.date,
                    cord_id=row.cord_id,
                    acronym=row.acronym,
                    kpi_name=row.kpi_name,
                    kpi_version=row.kpi_version,
                    value=row.value)
        try:
            bad_queries.execute()
            keys_not_found_queries.execute()
            good_queries.execute()
        except Exception as e:
            log_file.write(str(e))
        min_date -= step
        max_date -= step
    end_time = time.time()
    total_time += (end_time - start_time)
    print("Cord_id %s processed in %s" % (cord, (end_time - start_time)))
    log_file.write("Cord %s processed \n" % cord)
print("Total processing time %s" % total_time)

class HierarchyModel(Model):
    parentid = columns.Integer(primary_key=True)
    childid = columns.Integer(primary_key=True, clustering_order="DESC")
    type = columns.Text()


# Connect to the geonames keyspace
connection.setup(['127.0.0.1'], "geonames", protocol_version=3)
sync_table(HierarchyModel)

# Import the data.
# The table therefore takes the form: [parentid, childid, type]
with open("geonames/hierarchy.txt", encoding='utf8', newline='') as csvfile:
    # Build the DictReader
    reader = csv.DictReader(csvfile, fieldnames=['parentid', 'childid', 'type'],
                            delimiter='\t')
    # Batch ingestion
    count = 0
    batch = BatchQuery()
    for row in reader:
        new_row = clean_row(row)
        HierarchyModel.batch(batch).create(**new_row)
        count += 1
        if not count % 1000:
            batch.execute()
            batch = BatchQuery()
            logger.info('Imported: {}'.format(count))
    batch.execute()
