コード例 #1
0
ファイル: import.py プロジェクト: strapdata/openfoodfacts
def import_csv(csv_path, batch=False, chunksize=500, skiprows=None):
    """Import a tab-separated product file into the ``Product`` table.

    The file is read chunk by chunk. Rows without a usable ``code`` are
    skipped, and a failure on a single row is reported on stderr without
    aborting the import.

    Args:
        csv_path: Path of the tab-separated file to read.
        batch: When true, the inserts of each chunk are queued on one
            ``BatchQuery`` that is executed once the chunk is processed;
            otherwise every row is written immediately.
        chunksize: Number of rows pandas loads per chunk.
        skiprows: Forwarded unchanged to ``pandas.read_csv``.
    """
    start_time = dt.datetime.today().timestamp()

    # NOTE(review): error_bad_lines / warn_bad_lines are deprecated in recent
    # pandas releases in favour of on_bad_lines -- confirm the pinned version.
    for df in pd.read_csv(csv_path,
                          delimiter='\t',
                          encoding='utf-8',
                          dtype=dtype,
                          converters=converters,
                          chunksize=chunksize,
                          skiprows=skiprows,
                          error_bad_lines=False,
                          warn_bad_lines=True):

        print("transform")
        # Replace missing values by None so they can be filtered out below.
        df = df.where((pd.notnull(df)), None)
        print("importing rows {} to {}".format(df.index.min(), df.index.max()))

        b = BatchQuery() if batch else None

        for i, row in df.iterrows():
            code = row['code']
            # Only real strings are encoded for logging: a missing code must
            # stay printable, otherwise the error reporting itself raises and
            # kills the whole import.
            code_label = code.encode('utf-8') if isinstance(code, str) else code
            try:
                print(u"DOING {} ; {}".format(i, code_label))

                if not isinstance(code, str) or not code.strip():
                    print("error with line {0} : code = '{1}'".format(
                        i, code_label))
                    continue

                # Map CQL column names to model attribute names, dropping
                # *_datetime columns as well as None and empty-string values.
                row_converted = {
                    Product._get_column_by_db_name(cql_name).column_name: value
                    for cql_name, value in row.items()
                    if not cql_name.endswith('_datetime') and value is not None
                    and (not isinstance(value, str) or len(value) > 0)
                }

                if batch:
                    # Queue the insert; it is sent when the batch executes.
                    Product.batch(b).create(**row_converted)
                else:
                    # create() already persists the row, no extra save needed.
                    Product.create(**row_converted)

                print(u"DONE {} ; {}".format(i, code_label))
            except Exception:
                # Best effort: report the failing row and keep importing.
                print(u"EXCEPTION {} ; {}".format(i, code_label))
                exc_type, exc_value, exc_traceback = sys.exc_info()
                traceback.print_exception(exc_type,
                                          exc_value,
                                          exc_traceback,
                                          limit=10,
                                          file=sys.stderr)

        if batch:
            print("executing batch for rows {} to {}".format(
                df.index.min(), df.index.max()))
            b.execute()

        time_diff = dt.datetime.today().timestamp() - start_time
        # The last chunk may be shorter than chunksize, and a very fast chunk
        # can yield a zero time difference.
        if time_diff > 0:
            print("TIMING {} rows/s".format(len(df) / time_diff))
        start_time = dt.datetime.today().timestamp()
コード例 #2
0
ファイル: cassandra_insert.py プロジェクト: lalaique/big-data
def populate_customer_invoice_table():
    """Load the customer invoice CSV into the ``CustomerInvoice`` table.

    The table is synced first, then every CSV row is queued on a
    ``BatchQuery`` that is flushed every ``batch_size`` rows. Rows left over
    after the loop are flushed at the end.
    """
    sync_table(CustomerInvoice)
    customer_invoice = pd.read_csv(CUSTOMER_INVOICE_TABLE_FILE_NAME)
    # Insert data into Cassandra table in 100 row batches
    batch_size = 100
    pending = 0
    batch_manager = BatchQuery()
    for _, row in tqdm(customer_invoice.iterrows(),
                       total=customer_invoice.shape[0]):
        CustomerInvoice.batch(batch_manager) \
            .create(customer_id=int(row['customer_id']),
                    invoice_date=datetime.strptime(row['invoice_date'], "%Y-%m-%d %H:%M:%S"),
                    product_code=str(row['product_code']),
                    invoice_id=row['invoice_id'],
                    customer_email=row['customer_email'],
                    customer_phone_number=str(row['customer_phone_number']),
                    customer_country=row['customer_country'],
                    customer_postcode=str(row['customer_postcode']),
                    customer_house_number=str(row['customer_house_number']),
                    customer_has_loyalty_card=row['customer_has_loyalty_card'],
                    product_description=row['product_description'],
                    product_unit_price=row['product_unit_price'],
                    product_quantity=row['product_quantity'],
                    invoice_total=row['invoice_total'])
        pending += 1
        if pending == batch_size:
            batch_manager.execute()
            # Start a fresh batch: executing the same BatchQuery again makes
            # cqlengine warn that the batch was executed multiple times.
            batch_manager = BatchQuery()
            pending = 0
    # Flush the last, partial batch only when it holds something.
    if pending:
        batch_manager.execute()
コード例 #3
0
    def test_batch_insert_if_not_exists_success(self):
        """Check that a batched ``if_not_exists`` insert keeps an existing row."""

        row_id = uuid4()

        # First insert: executed when the context manager exits.
        with BatchQuery() as first_batch:
            TestIfNotExistsModel.batch(first_batch).if_not_exists().create(
                id=row_id, count=8, text='123456789')

        # Second insert reuses the key and is expected to be rejected.
        second_batch = BatchQuery()
        TestIfNotExistsModel.batch(second_batch).if_not_exists().create(
            id=row_id, count=9, text='111111111111')
        with self.assertRaises(LWTException) as raised:
            second_batch.execute()

        # The exception carries the row that blocked the insert.
        expected_existing = {
            '[applied]': False,
            'id': row_id,
            'count': 8,
            'text': '123456789',
        }
        self.assertEqual(raised.exception.existing, expected_existing)

        matches = TestIfNotExistsModel.objects(id=row_id)
        self.assertEqual(len(matches), 1)

        # The stored values are still those of the first insert.
        stored = matches.first()
        self.assertEqual(stored.count, 8)
        self.assertEqual(stored.text, '123456789')
コード例 #4
0
    def test_batch_update_conditional_several_rows(self):
        """Check a batch of ``if_not_exists`` inserts where some rows exist.

        The batch raises ``LWTException`` while rows for clusters 1 and 2
        exist, and is executed again once they have been deleted.
        """
        sync_table(TestUpdateModel)
        self.addCleanup(drop_table, TestUpdateModel)

        # Rows that are already present when the batch is first executed.
        existing_rows = [
            TestUpdateModel.create(partition=1,
                                   cluster=cluster,
                                   value=5,
                                   text="something")
            for cluster in (1, 2)
        ]

        batch = BatchQuery()
        for cluster in (1, 2, 3):
            TestUpdateModel.batch(batch).if_not_exists().create(
                partition=1,
                cluster=cluster,
                value=5,
                text='something else')

        # The response will be more than two rows because two of the inserts will fail
        with self.assertRaises(LWTException):
            batch.execute()

        # With the conflicting rows gone, the same batch is run again.
        for row in existing_rows:
            row.delete()
        batch.execute()
コード例 #5
0
ファイル: csv.py プロジェクト: dankolbman/stoic
def parse_csv(filepath, username, trip):
    """
    Parse a csv and import its rows to the database as ``Point`` records.

    Rows are queued on a ``BatchQuery`` that is flushed every 1000 input
    rows. A row is skipped when its ``time`` equals the previously stored
    one, or when handling it raises ``ValueError``.

    :param filepath: path of the csv file; it needs the columns ``time``,
        ``lon``, ``lat`` and ``accuracy``
    :param username: value stored as ``username`` on every point
    :param trip: value stored as ``trip_id`` on every point
    :return: the ``(username, trip)`` pair that was passed in
    """
    # newline='' lets the csv module do its own newline handling, so quoted
    # fields spanning several lines are parsed correctly and the file is
    # streamed instead of being loaded as a whole.
    with open(filepath, 'r', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        b = BatchQuery()
        last_dt = None
        for i, line in enumerate(reader):
            # Flush every 1000 rows, but never the still-empty first batch.
            if i and i % 1000 == 0:
                b.execute()
                b = BatchQuery()
            try:
                dt = parser.parse(line['time'])
                if dt == last_dt:
                    continue
                # NOTE(review): the key is spelled 'accurracy' -- presumably
                # it matches the Point model's column name; confirm.
                pt = {'lon': line['lon'],
                      'lat': line['lat'],
                      'accurracy': line['accuracy'],
                      'username': username,
                      'created_at': dt,
                      'trip_id': trip}
                last_dt = dt
                Point.batch(b).create(**pt)
            except ValueError:
                continue
        b.execute()
    return username, trip
コード例 #6
0
    def test_batch_insert_if_not_exists(self):
        """Verify a second batched ``if_not_exists`` insert raises and changes nothing."""

        key = uuid4()

        # Initial insert, executed on leaving the ``with`` block.
        with BatchQuery() as initial:
            TestIfNotExistsModel.batch(initial).if_not_exists().create(id=key, count=8, text='123456789')

        # Same key again: executing the batch must raise.
        duplicate = BatchQuery()
        TestIfNotExistsModel.batch(duplicate).if_not_exists().create(id=key, count=9, text='111111111111')
        with self.assertRaises(LWTException) as ctx:
            duplicate.execute()

        self.assertEqual(
            ctx.exception.existing,
            {'id': key, 'count': 8, 'text': '123456789', '[applied]': False},
        )

        rows = TestIfNotExistsModel.objects(id=key)
        self.assertEqual(len(rows), 1)

        # Values of the first insert are untouched.
        row = rows.first()
        self.assertEqual(row.count, 8)
        self.assertEqual(row.text, '123456789')
コード例 #7
0
def import_data():
    """Load ``geonames/allCountries.txt`` into the ``Geoname`` table.

    The tab-separated file is read row by row, every row is passed through
    ``clean_row`` and queued on a ``BatchQuery`` that is executed and
    replaced every 1000 rows.
    """
    connection.setup(['127.0.0.1'], "geonames", protocol_version=3)
    fieldnames = list(Geoname().__dict__['_values'])

    with open("geonames/allCountries.txt", encoding='utf8',
              newline='') as csvfile:

        # Build the DictReader (tab-delimited, quoting disabled)
        csv.register_dialect('geoname', delimiter='\t', quoting=csv.QUOTE_NONE)
        reader = csv.DictReader(csvfile,
                                fieldnames=fieldnames,
                                dialect='geoname')

        # Batched ingestion
        pending = BatchQuery()
        for imported, row in enumerate(reader, start=1):
            cleaned = clean_row(row)
            Geoname.batch(pending).create(**cleaned)
            if imported % 1000 == 0:
                pending.execute()
                pending = BatchQuery()
                logger.info('Importés: {}'.format(imported))

        # Send the rows of the last, partial batch
        pending.execute()
コード例 #8
0
    def store_cancer_data(self, cancer_data):
        """Store every record of ``cancer_data`` as a ``CancerDataEntity``.

        Country, age and gender are read once from
        ``cancer_data.data_group_info`` and shared by all rows. The inserts
        are queued on one ``BatchQuery`` (consistency ``ONE``) that is
        executed after the loop.

        :param cancer_data: object exposing ``data_group_info`` and an
            iterable ``cancer_data`` of records
        """
        data_group_info = cancer_data.data_group_info

        country = data_group_info.country
        age = data_group_info.age_gender.age
        gender = data_group_info.age_gender.gender

        bq = BatchQuery(consistency=ConsistencyLevel.ONE)
        for cancer_data_record in cancer_data.cancer_data:
            CancerDataEntity.batch(bq).create(
                age=age,
                gender=gender,
                country=country,
                cancer=cancer_data_record.cancer,
                # Previously read from crude_rate, which stored the crude
                # rate in both columns.
                # NOTE(review): assumes the record exposes ``asr`` -- confirm.
                asr=cancer_data_record.asr,
                crude_rate=cancer_data_record.crude_rate,
                cumulative_risk=cancer_data_record.cumulative_risk,
                deaths=cancer_data_record.deaths,
            )
        bq.execute()
コード例 #9
0
    def test_insert_success_case(self):
        """A batched create is only readable once the batch was executed."""
        lookup = dict(partition=self.pkey, cluster=2)

        batch = BatchQuery()
        TestMultiKeyModel.batch(batch).create(count=3, text='4', **lookup)

        # Before execution the row cannot be fetched.
        with self.assertRaises(TestMultiKeyModel.DoesNotExist):
            TestMultiKeyModel.get(**lookup)

        batch.execute()

        # After execution the same lookup must not raise.
        TestMultiKeyModel.get(**lookup)
コード例 #10
0
ファイル: repos.py プロジェクト: weiguang-zz/stratege_engine
 def save_ts(self, ts_list: List[TSData]):
     """Serialize each entry of ``ts_list`` and store them all in one batch.

     The serializer of an entry is looked up in ``TSFunctionRegistry`` by
     its ``ts_type_name``. The batch is executed after the whole list was
     queued.
     """
     batch = BatchQuery()
     for entry in ts_list:
         serializer = TSFunctionRegistry.find_function(entry.ts_type_name)
         payload: str = serializer.serialize(entry.values)
         TimeSeriesDataModel.batch(batch).create(type=entry.ts_type_name,
                                                 code=entry.code,
                                                 visible_time=entry.visible_time,
                                                 data=payload)
     batch.execute()
コード例 #11
0
    def test_insert_success_case(self):
        """Check that a create queued on a batch is deferred until execute()."""

        pending = BatchQuery()
        TestMultiKeyModel.batch(pending).create(
            partition=self.pkey, cluster=2, count=3, text='4')

        # Nothing has been written yet.
        with self.assertRaises(TestMultiKeyModel.DoesNotExist):
            TestMultiKeyModel.get(partition=self.pkey, cluster=2)

        pending.execute()

        # The row now exists; get() would raise otherwise.
        TestMultiKeyModel.get(partition=self.pkey, cluster=2)
コード例 #12
0
    def truncate_models(self):
        """Delete every row of every Cassandra model, one batch per model."""
        log.info('Purging data from all tables...')

        for model_cls in self.get_cassandra_models():
            deletions = BatchQuery()
            # Queue one delete per stored instance, then send them together.
            for row in model_cls.objects.all():
                row.batch(deletions).delete()
            deletions.execute()

        log.info('Done!')
コード例 #13
0
ファイル: repos.py プロジェクト: ajmal017/stratege_engine
 def save(self, ts_list: List[TSData]):
     """Serialize each entry of ``ts_list`` and store them all in one batch.

     Raises:
         RuntimeError: when the type registered for an entry is not a
             ``HistoryTimeSeriesType``. The batch is only executed after the
             loop, so nothing queued so far is executed in that case.
     """
     batch = BatchQuery()
     for entry in ts_list:
         ts_type = TSTypeRegistry.find_function(entry.ts_type_name)
         if not isinstance(ts_type, HistoryTimeSeriesType):
             raise RuntimeError("非法的tsType")
         payload: str = ts_type.serialize(entry.values)
         TimeSeriesDataModel.batch(batch).create(type=entry.ts_type_name,
                                                 code=entry.code,
                                                 visible_time=entry.visible_time,
                                                 data=payload)
     batch.execute()
コード例 #14
0
    def test_async_batch(self):
        """A create queued on an ``execute_async`` batch is deferred until execute()."""
        lookup = dict(partition=self.pkey, cluster=2)

        async_batch = BatchQuery(execute_async=True)
        TestMultiKeyModel.batch(async_batch).create(count=3, text='4', **lookup)

        # The row is not there before the batch runs.
        with self.assertRaises(TestMultiKeyModel.DoesNotExist):
            TestMultiKeyModel.get(**lookup)

        async_batch.execute()

        # The lookup is expected to succeed now (get raises otherwise).
        TestMultiKeyModel.get(**lookup)
コード例 #15
0
ファイル: ingest.py プロジェクト: wrt2dc/drastic
    def process_create_entry_work(self, rdict, context, do_load):
        """Create the resource described by ``rdict``, or fetch and update it.

        The create and update statements are queued on one ``BatchQuery``
        that is executed at the very end, after the search index calls.

        :param rdict: keyword arguments forwarded to the resource creation;
            ``rdict['name']`` and, when loading, ``rdict['size']`` are also
            read directly.
        :param context: mapping read for ``local_ip``, ``path`` and ``entry``
            (when not loading), ``fullpath`` (when loading) and
            ``collection`` (when the resource already exists).
        :param do_load: when true, the file content is stored through
            ``Blob.create_from_file`` and the URL points at that blob;
            otherwise a ``file://`` URL is built from ``context``.
        :return: ``None``; returned early when no blob could be created.
        """
        b = BatchQuery()
        # In most cases the resource will not exist yet, so start by computing
        # the URL and trying to insert the entire record.
        if not do_load:
            url = u"file://{}{}/{}".format(decode_str(context['local_ip']),
                                           decode_str(context['path']),
                                           decode_str(context['entry']))
        else:
            # NOTE(review): the file is opened in text mode ('r'); binary
            # content would presumably need 'rb' -- confirm.
            with open(context['fullpath'], 'r') as f:
                blob = Blob.create_from_file(f, rdict['size'])
                if blob:
                    url = "cassandra://{}".format(blob.id)
                else:
                    # No blob was created: give up on this entry.
                    return None

        try:
            # Try to insert (create) the record and log how long it took.
            t1 = time.time()
            resource = Resource.batch(b).create(url=url, **rdict)
            msg = u'Resource {} created --> {}'.format(resource.name,
                                                       time.time() - t1)
            logger.info(msg)
        except ResourceConflictError:
            # The create failed because the record already exists, so
            # retrieve it instead.
            t1 = time.time()
            resource = Resource.objects().get(container=context['collection'],
                                              name=rdict['name'])
            msg = u"{} ::: Fetch Object -> {}".format(resource.name,
                                                      time.time() - t1)
            logger.info(msg)

        # If the stored URL differs from the computed one, queue an update.
        # TODO: If the URL is a block set that's stored internally, reduce its count so that it can be garbage collected
        if resource.url != url:
            t2 = time.time()
            # if url.startswith('cassandra://') : tidy up the stored block count...
            resource.batch(b).update(url=url)
            t3 = time.time()
            msg = u"{} ::: update -> {}".format(resource.name, t3 - t2)
            logger.info(msg)

        # Reset, then rebuild, the search index entries for this resource.
        SearchIndex.reset(resource.id)
        SearchIndex.index(resource, ['name', 'metadata'])

        # Send the statements queued on the batch above.
        b.execute()
コード例 #16
0
    def test_callbacks_properly_execute_callables_and_tuples(self):
        """Callbacks added to a batch run on execute() with their extra args."""

        recorded = []

        def record_call(*args, **kwargs):
            # Keep only the positional arguments of each invocation.
            recorded.append(args)

        batch = BatchQuery()

        # One callback without arguments, one with two positional arguments.
        batch.add_callback(record_call)
        batch.add_callback(record_call, 'more', 'args')

        batch.execute()

        assert len(recorded) == 2
        assert recorded == [(), ('more', 'args')]
コード例 #17
0
    def test_update_success_case(self):
        """A save queued on a batch only takes effect once the batch executes."""
        lookup = dict(partition=self.pkey, cluster=2)

        record = TestMultiKeyModel.create(count=3, text='4', **lookup)

        batch = BatchQuery()

        # Change the count locally and queue the save on the batch.
        record.count = 4
        record.batch(batch).save()

        # The stored row still carries the old value.
        before = TestMultiKeyModel.get(**lookup)
        assert before.count == 3

        batch.execute()

        # The new value is visible after execution.
        after = TestMultiKeyModel.get(**lookup)
        assert after.count == 4
コード例 #18
0
    def test_update_success_case(self):
        """Check that a batched save is deferred until the batch is executed."""

        row = TestMultiKeyModel.create(
            partition=self.pkey, cluster=2, count=3, text='4')

        pending = BatchQuery()

        # Queue the modified row; nothing is written yet.
        row.count = 4
        row.batch(pending).save()

        stored_before = TestMultiKeyModel.get(partition=self.pkey, cluster=2)
        self.assertEqual(stored_before.count, 3)

        pending.execute()

        stored_after = TestMultiKeyModel.get(partition=self.pkey, cluster=2)
        self.assertEqual(stored_after.count, 4)
コード例 #19
0
    def test_callbacks_properly_execute_callables_and_tuples(self):
        """Check that batch callbacks are invoked on execute() with their args."""

        seen_args = []

        def remember(*args, **kwargs):
            # Record the positional arguments the callback was called with.
            seen_args.append(args)

        callbacks_batch = BatchQuery()

        # Register the callback once bare and once with extra arguments.
        callbacks_batch.add_callback(remember)
        callbacks_batch.add_callback(remember, 'more', 'args')

        callbacks_batch.execute()

        expected_calls = [(), ('more', 'args')]
        self.assertEqual(len(seen_args), 2)
        self.assertEqual(expected_calls, seen_args)
コード例 #20
0
ファイル: cassandra_insert.py プロジェクト: lalaique/big-data
def populate_daily_revenue_table():
    """Load the daily revenue CSV into the ``DailyRevenue`` table.

    The table is synced first, then every CSV row is queued on a
    ``BatchQuery`` that is flushed every ``batch_size`` rows. Rows left over
    after the loop are flushed at the end.
    """
    sync_table(DailyRevenue)
    daily_revenue = pd.read_csv(DAILY_REVENUE_TABLE_FILE_NAME)

    batch_size = 200
    pending = 0
    batch_manager = BatchQuery()
    for _, row in tqdm(daily_revenue.iterrows(),
                       total=daily_revenue.shape[0]):
        DailyRevenue.batch(batch_manager) \
            .create(invoice_date=datetime.strptime(row['invoice_date'], "%Y-%m-%d"),
                    daily_revenue=row['daily_revenue'])
        pending += 1
        if pending == batch_size:
            batch_manager.execute()
            # Start a fresh batch: executing the same BatchQuery again makes
            # cqlengine warn that the batch was executed multiple times.
            batch_manager = BatchQuery()
            pending = 0
    # Flush the last, partial batch only when it holds something.
    if pending:
        batch_manager.execute()
コード例 #21
0
 def append(self, sequenced_item_or_items):
     """Write one sequenced item, or a list of them, to the database.

     A list is inserted through a single batch of ``if_not_exists``
     creates; an empty list does nothing. A single item is converted to an
     active record and saved. In both cases an ``LWTException`` is handed
     to ``raise_sequenced_item_error``.
     """
     # Single item: save its active record directly.
     if not isinstance(sequenced_item_or_items, list):
         active_record = self.to_active_record(sequenced_item_or_items)
         try:
             active_record.save()
         except LWTException as e:
             self.raise_sequenced_item_error(sequenced_item_or_items, e)
         return

     # Empty list: nothing to write.
     if not len(sequenced_item_or_items):
         return

     batch = BatchQuery()
     for item in sequenced_item_or_items:
         assert isinstance(item, self.sequenced_item_class), (type(item), self.sequenced_item_class)
         field_kwargs = self.get_field_kwargs(item)
         self.active_record_class.batch(batch).if_not_exists().create(**field_kwargs)
     try:
         batch.execute()
     except LWTException as e:
         self.raise_sequenced_item_error(sequenced_item_or_items, e)
コード例 #22
0
ファイル: cassandra_insert.py プロジェクト: lalaique/big-data
def populate_product_sale_counts_table():
    """Load the product sale counts CSV into the ``ProductSaleCounts`` table.

    The table is synced first, then every CSV row is queued on a
    ``BatchQuery`` that is flushed every ``batch_size`` rows. Rows left over
    after the loop are flushed at the end.
    """
    sync_table(ProductSaleCounts)
    product_sale_counts = pd.read_csv(PRODUCT_SALE_COUNTS_FILE_NAME)

    batch_size = 200
    pending = 0
    batch_manager = BatchQuery()
    for _, row in tqdm(product_sale_counts.iterrows(),
                       total=product_sale_counts.shape[0]):
        ProductSaleCounts.batch(batch_manager) \
            .create(product_code=str(row['product_code']),
                    product_total_quantity_sold=row['product_total_quantity_sold'],
                    product_description=row['product_description'])
        pending += 1
        if pending == batch_size:
            batch_manager.execute()
            # Start a fresh batch: executing the same BatchQuery again makes
            # cqlengine warn that the batch was executed multiple times.
            batch_manager = BatchQuery()
            pending = 0
    # Flush the last, partial batch only when it holds something.
    if pending:
        batch_manager.execute()
コード例 #23
0
ファイル: cassandra_insert.py プロジェクト: lalaique/big-data
def populate_product_revenue_table():
    """Load the product revenue CSV into the ``ProductRevenue`` table.

    The table is synced first, then every CSV row is queued on a
    ``BatchQuery`` that is flushed every ``batch_size`` rows. Rows left over
    after the loop are flushed at the end.
    """
    sync_table(ProductRevenue)
    product_revenue = pd.read_csv(PRODUCT_REVENUE_TABLE_FILE_NAME)

    batch_size = 200
    pending = 0
    batch_manager = BatchQuery()
    for _, row in tqdm(product_revenue.iterrows(),
                       total=product_revenue.shape[0]):
        ProductRevenue.batch(batch_manager) \
            .create(product_code=str(row['product_code']),
                    product_total_revenue=row['product_total_revenue'],
                    product_description=row['product_description'])
        pending += 1
        if pending == batch_size:
            batch_manager.execute()
            # Start a fresh batch: executing the same BatchQuery again makes
            # cqlengine warn that the batch was executed multiple times.
            batch_manager = BatchQuery()
            pending = 0
    # Flush the last, partial batch only when it holds something.
    if pending:
        batch_manager.execute()
コード例 #24
0
    def test_batch_update_conditional_several_rows(self):
        """Run a batch of ``if_not_exists`` inserts against partly existing rows.

        The first execution is expected to raise ``LWTException``; the batch
        is executed once more after the existing rows were deleted.
        """
        sync_table(TestUpdateModel)
        self.addCleanup(drop_table, TestUpdateModel)

        # Pre-existing rows for clusters 1 and 2.
        row_one = TestUpdateModel.create(
            partition=1, cluster=1, value=5, text="something")
        row_two = TestUpdateModel.create(
            partition=1, cluster=2, value=5, text="something")

        inserts = BatchQuery()
        for cluster_key in (1, 2, 3):
            TestUpdateModel.batch(inserts).if_not_exists().create(
                partition=1, cluster=cluster_key, value=5, text='something else')

        # The response will be more than two rows because two of the inserts will fail
        with self.assertRaises(LWTException):
            inserts.execute()

        # Remove the conflicting rows and run the same batch again.
        row_one.delete()
        row_two.delete()
        inserts.execute()
コード例 #25
0
def batch_save(models, ttl=None):
    """Save ``models`` through batch queries of at most MAX_BATCH_SIZE items.

    Args:
        models: Sized collection of model instances to save. It must support
            slicing when it holds more than MAX_BATCH_SIZE items.
        ttl: Optional TTL applied to every save; ``None`` saves without TTL.
    """
    if len(models) > MAX_BATCH_SIZE:
        # Save chunk by chunk. The TTL has to be forwarded, otherwise large
        # inputs would be written without it.
        for start in range(0, len(models), MAX_BATCH_SIZE):
            batch_save(models[start:start + MAX_BATCH_SIZE], ttl=ttl)
        return

    b = BatchQuery()
    for m in models:
        query = m.batch(b)
        if ttl is not None:
            query = query.ttl(ttl)
        query.save()
    b.execute()
コード例 #26
0
    def test_batch_insert_if_not_exists_success(self):
        """Verify that a repeated batched ``if_not_exists`` insert is rejected."""

        record_id = uuid4()

        # The first insert runs when the ``with`` block is left.
        with BatchQuery() as creating:
            TestIfNotExistsModel.batch(creating).if_not_exists().create(id=record_id, count=8, text='123456789')

        # Inserting the same id again must fail on execution.
        repeating = BatchQuery()
        TestIfNotExistsModel.batch(repeating).if_not_exists().create(id=record_id, count=9, text='111111111111')
        with self.assertRaises(LWTException):
            repeating.execute()

        found = TestIfNotExistsModel.objects(id=record_id)
        self.assertEqual(len(found), 1)

        # The row still holds the values of the first insert.
        kept = found.first()
        self.assertEqual(kept.count, 8)
        self.assertEqual(kept.text, '123456789')
コード例 #27
0
ファイル: manager.py プロジェクト: marchon/eventsourcing
 def append(self, sequenced_item_or_items):
     """Write one sequenced item, or a list of them, to the database.

     A list is inserted through a single batch of ``if_not_exists``
     creates; an empty list does nothing. A single item is converted to a
     record and saved. In both cases an ``LWTException`` leads to
     ``raise_sequenced_item_conflict``.
     """
     # Single item: convert and save it directly.
     if not isinstance(sequenced_item_or_items, list):
         single_record = self.to_record(sequenced_item_or_items)
         try:
             single_record.save()
         except LWTException:
             self.raise_sequenced_item_conflict()
         return

     # Empty list: nothing to write.
     if not len(sequenced_item_or_items):
         return

     batch = BatchQuery()
     for item in sequenced_item_or_items:
         assert isinstance(item, self.sequenced_item_class), (
             type(item), self.sequenced_item_class)
         field_kwargs = self.get_field_kwargs(item)
         self.record_class.batch(batch).if_not_exists().create(**field_kwargs)
     try:
         batch.execute()
     except LWTException:
         self.raise_sequenced_item_conflict()
コード例 #28
0
    def test_batch_update_conditional(self):
        """Check conditional (``iff``) updates issued through a batch."""
        original = TestConditionalModel.create(text='something', count=5)
        row_id = original.id

        # Condition matches the stored count; the batch runs on context exit.
        with BatchQuery() as matching_batch:
            original.batch(matching_batch).iff(count=5).update(text='something else')

        current = TestConditionalModel.objects(id=row_id).first()
        self.assertEqual(current.text, 'something else')

        # Condition does not match: executing the batch must raise.
        failing_batch = BatchQuery()
        current.batch(failing_batch).iff(count=6).update(text='and another thing')
        with self.assertRaises(LWTException) as raised:
            failing_batch.execute()

        self.assertEqual(
            raised.exception.existing,
            {'[applied]': False, 'id': row_id, 'count': 5},
        )

        # The text of the rejected update was not stored.
        current = TestConditionalModel.objects(id=row_id).first()
        self.assertEqual(current.text, 'something else')
コード例 #29
0
    def test_batch_update_conditional(self):
        """Verify a batched ``iff`` update applies on a match and raises otherwise."""
        created = TestConditionalModel.create(text='something', count=5)
        key = created.id

        # Matching condition, executed when the ``with`` block ends.
        with BatchQuery() as ok_batch:
            created.batch(ok_batch).iff(count=5).update(text='something else')

        refreshed = TestConditionalModel.objects(id=key).first()
        self.assertEqual(refreshed.text, 'something else')

        # Non-matching condition (stored count is 5, not 6).
        bad_batch = BatchQuery()
        refreshed.batch(bad_batch).iff(count=6).update(text='and another thing')
        with self.assertRaises(LWTException) as ctx:
            bad_batch.execute()

        expected_existing = {
            'count': 5,
            'id': key,
            '[applied]': False,
        }
        self.assertEqual(ctx.exception.existing, expected_existing)

        # The row keeps the text written by the first update.
        refreshed = TestConditionalModel.objects(id=key).first()
        self.assertEqual(refreshed.text, 'something else')
コード例 #30
0
    def test_empty_batch(self):
        """An empty batch can be executed explicitly and as a context manager."""
        # Explicit execution of a batch without any statement.
        empty = BatchQuery()
        empty.execute()

        # Same through the context-manager form.
        with BatchQuery() as empty:
            pass
コード例 #31
0
                    cord_id=row.cord_id,
                    acronym=row.acronym,
                    kpi_name=row.kpi_name,
                    kpi_version=row.kpi_version,
                    value=row.value)
            else:
                PlmnKeyNotFound.batch(keys_not_found_queries).create(
                    kpi_basename=row.kpi_basename,
                    date=row.date,
                    cord_id=row.cord_id,
                    acronym=row.acronym,
                    kpi_name=row.kpi_name,
                    kpi_version=row.kpi_version,
                    value=row.value)
        try:
            bad_queries.execute()
            keys_not_found_queries.execute()
            good_queries.execute()
        except Exception as e:
            log_file.write(str(e))
            pass

        min_date -= step
        max_date -= step

    end_time = time.time()
    total_time += (end_time - start_time)
    print("Cord_id %s processed in %s" % (cord, (end_time - start_time)))
    log_file.write("Cord %s processed \n" % cord)
    print("Total processing time %s" % total_time)
コード例 #32
0
    def test_empty_batch(self):
        """Check that running a batch holding no statement does not raise."""
        # Direct call to execute() on a fresh batch.
        nothing_queued = BatchQuery()
        nothing_queued.execute()

        # Entering and leaving the context manager without queuing anything.
        with BatchQuery() as nothing_queued:
            pass
コード例 #33
0
    parentid = columns.Integer(primary_key=True)
    childid = columns.Integer(primary_key=True, clustering_order="DESC")
    type = columns.Text()


# Connect to the geonames keyspace and sync the hierarchy table
connection.setup(['127.0.0.1'], "geonames", protocol_version=3)
sync_table(HierarchyModel)

# Import the data:
# each row of the file is read as [parentid, childid, type]
with open("geonames/hierarchy.txt", encoding='utf8', newline='') as csvfile:

    # Build the DictReader over the tab-separated file
    reader = csv.DictReader(
        csvfile,
        delimiter='\t',
        fieldnames=['parentid', 'childid', 'type'],
    )

    # Batched ingestion: execute and replace the batch every 1000 rows
    batch = BatchQuery()
    count = 0
    for count, row in enumerate(reader, start=1):
        new_row = clean_row(row)
        HierarchyModel.batch(batch).create(**new_row)
        if count % 1000 == 0:
            batch.execute()
            batch = BatchQuery()
            logger.info('Importés: {}'.format(count))

    # Send the rows of the last, partial batch
    batch.execute()